<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "https://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" lang="en-US">
<head>
<!-- Document metadata: character encoding, IE compatibility mode, generator tag and viewport. -->
<!-- The encoding pragma uses the registered text/html media type ("text/xhtml" is not a registered type). -->
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8"/>
<meta http-equiv="X-UA-Compatible" content="IE=11"/>
<meta name="generator" content="Doxygen 1.12.0"/>
<meta name="viewport" content="width=device-width, initial-scale=1"/>
<title>NeuZephyr: nz::krnl Namespace Reference</title>
<!-- The favicon is a PNG file, so it is declared as image/png rather than image/x-icon. -->
<link rel="icon" href="NZ_logo2.png" type="image/png" />
<!-- Stylesheets and scripts; jquery.js is loaded before the scripts that follow it. -->
<link href="tabs.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="jquery.js"></script>
<script type="text/javascript" src="dynsections.js"></script>
<link href="navtree.css" rel="stylesheet" type="text/css"/>
<script type="text/javascript" src="resize.js"></script>
<link href="doxygen.css" rel="stylesheet" type="text/css" />
</head>
<body>
<div id="top"><!-- do not remove this div, it is closed by doxygen! -->
<!-- Title area: a one-row table that places the project logo beside the project name and brief. -->
<div id="titlearea">
<table cellspacing="0" cellpadding="0">
 <tbody>
 <tr id="projectrow">
  <!-- Left cell: project logo image. -->
  <td id="projectlogo"><img alt="Logo" src="NZ_logo2.png"/></td>
  <!-- Right cell: project name followed by its one-line brief. -->
  <td id="projectalign">
   <div id="projectname">NeuZephyr
   </div>
   <div id="projectbrief">Simple DL Framework</div>
  </td>
 </tr>
 </tbody>
</table>
</div>
<!-- end header part -->
<!-- Generated by Doxygen 1.12.0 -->
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
// Registers a ready callback through $ (presumably jQuery from jquery.js) that calls codefold.init(0).
// NOTE(review): codefold is not defined in this file; presumably provided by dynsections.js - confirm.
$(function() { codefold.init(0); });
/* @license-end */
</script>
  <!-- Primary tab row: links to the top-level documentation indexes. -->
  <div id="navrow1" class="tabs">
    <ul class="tablist">
      <li><a href="index.html"><span>Main&#160;Page</span></a></li>
      <li><a href="pages.html"><span>Related&#160;Pages</span></a></li>
      <!-- class="current" marks the tab for the section this page belongs to. -->
      <li class="current"><a href="namespaces.html"><span>Namespaces</span></a></li>
      <li><a href="annotated.html"><span>Classes</span></a></li>
      <li><a href="files.html"><span>Files</span></a></li>
    </ul>
  </div>
  <!-- Secondary tab row: sub-pages of the Namespaces section. -->
  <div id="navrow2" class="tabs2">
    <ul class="tablist">
      <li><a href="namespaces.html"><span>Namespace&#160;List</span></a></li>
      <li><a href="namespacemembers.html"><span>Namespace&#160;Members</span></a></li>
    </ul>
  </div>
<script type="text/javascript">
/* @license magnet:?xt=urn:btih:d3d9a9a6595521f9666a5e94cc830dab83b65699&amp;dn=expat.txt MIT */
// Registers a ready callback through $ (presumably jQuery from jquery.js) that calls initResizable(false).
// NOTE(review): initResizable is not defined in this file; presumably provided by resize.js - confirm.
$(function(){ initResizable(false); });
/* @license-end */
</script>
<!-- Breadcrumb trail: nz (plain text, not linked) followed by a link to this krnl namespace page. -->
<div id="nav-path" class="navpath">
  <ul>
<li class="navelem"><b>nz</b></li><li class="navelem"><a class="el" href="namespacenz_1_1krnl.html">krnl</a></li>  </ul>
</div>
</div><!-- top -->
<div id="doc-content">
<!-- Page header: in-page summary link(s) followed by the page title. -->
<div class="header">
  <div class="summary">
<!-- Jumps to the "func-members" anchor of the Functions table further down this page. -->
<a href="#func-members">Functions</a>  </div>
  <div class="headertitle"><div class="title">nz::krnl Namespace Reference</div></div>
</div><!--header-->
<div class="contents">

<p>High-Performance CUDA Kernel Implementations for Tensor Computations.  
<a href="#details">More...</a></p>
<table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a id="func-members" name="func-members"></a>
Functions</h2></td></tr>
<tr class="memitem:a97cda6dfc6545efaee2b686eed9ae766" id="r_a97cda6dfc6545efaee2b686eed9ae766"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a97cda6dfc6545efaee2b686eed9ae766">MatrixAdd</a> (dim3 gridDim, dim3 blockDim, float *a, float *b, float *c, unsigned long long n, size_t offset_c=0, size_t offset_a=0, size_t offset_b=0)</td></tr>
<tr class="memdesc:a97cda6dfc6545efaee2b686eed9ae766"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform matrix addition on GPU.  <br /></td></tr>
<tr class="separator:a97cda6dfc6545efaee2b686eed9ae766"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a5b29c405a1df9534430ad8682960ebb5" id="r_a5b29c405a1df9534430ad8682960ebb5"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5b29c405a1df9534430ad8682960ebb5">MatrixAdd</a> (dim3 gridDim, dim3 blockDim, float *a, float *b, float *c, unsigned long long n, const std::vector&lt; size_t &gt; &amp;offset_c, const std::vector&lt; size_t &gt; &amp;offset_a, const std::vector&lt; size_t &gt; &amp;offset_b)</td></tr>
<tr class="memdesc:a5b29c405a1df9534430ad8682960ebb5"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform matrix addition on GPU.  <br /></td></tr>
<tr class="separator:a5b29c405a1df9534430ad8682960ebb5"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ad18a2b0efc0cdfc9cb861396ad4da53f" id="r_ad18a2b0efc0cdfc9cb861396ad4da53f"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad18a2b0efc0cdfc9cb861396ad4da53f">MatrixSub</a> (dim3 gridDim, dim3 blockDim, float *a, float *b, float *c, unsigned long long n, size_t offset_c=0, size_t offset_a=0, size_t offset_b=0)</td></tr>
<tr class="memdesc:ad18a2b0efc0cdfc9cb861396ad4da53f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform matrix subtraction on GPU.  <br /></td></tr>
<tr class="separator:ad18a2b0efc0cdfc9cb861396ad4da53f"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a4ca041c74dc55e3ac9124b5fd39b985c" id="r_a4ca041c74dc55e3ac9124b5fd39b985c"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4ca041c74dc55e3ac9124b5fd39b985c">MatrixSub</a> (dim3 gridDim, dim3 blockDim, float *a, float *b, float *c, unsigned long long n, const std::vector&lt; size_t &gt; &amp;offset_c, const std::vector&lt; size_t &gt; &amp;offset_a, const std::vector&lt; size_t &gt; &amp;offset_b)</td></tr>
<tr class="memdesc:a4ca041c74dc55e3ac9124b5fd39b985c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform matrix subtraction on GPU.  <br /></td></tr>
<tr class="separator:a4ca041c74dc55e3ac9124b5fd39b985c"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ae30a6e1de69588aa0c6eb8a5b8e6e826" id="r_ae30a6e1de69588aa0c6eb8a5b8e6e826"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae30a6e1de69588aa0c6eb8a5b8e6e826">GeneralMatrixMul</a> (dim3 gridDim, dim3 blockDim, float *A, float *B, float *C, unsigned long long M, unsigned long long N, unsigned long long K, size_t offset_c=0, size_t offset_a=0, size_t offset_b=0)</td></tr>
<tr class="memdesc:ae30a6e1de69588aa0c6eb8a5b8e6e826"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform single-precision matrix multiplication on GPU using CUDA cores.  <br /></td></tr>
<tr class="separator:ae30a6e1de69588aa0c6eb8a5b8e6e826"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aa3720ebf4ae0cc9f4abbd1e32842191b" id="r_aa3720ebf4ae0cc9f4abbd1e32842191b"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa3720ebf4ae0cc9f4abbd1e32842191b">GeneralMatrixMul</a> (dim3 gridDim, dim3 blockDim, float *A, float *B, float *C, unsigned long long M, unsigned long long N, unsigned long long K, const std::vector&lt; size_t &gt; &amp;offset_c, const std::vector&lt; size_t &gt; &amp;offset_a, const std::vector&lt; size_t &gt; &amp;offset_b)</td></tr>
<tr class="memdesc:aa3720ebf4ae0cc9f4abbd1e32842191b"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform single-precision matrix multiplication on GPU using CUDA cores.  <br /></td></tr>
<tr class="separator:aa3720ebf4ae0cc9f4abbd1e32842191b"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:afe3f38f788c735b7eb718443eb0fd094" id="r_afe3f38f788c735b7eb718443eb0fd094"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#afe3f38f788c735b7eb718443eb0fd094">Transpose</a> (dim3 gridDim, dim3 blockDim, float *d_A, float *d_B, unsigned int rows, unsigned int cols, size_t offset=0)</td></tr>
<tr class="memdesc:afe3f38f788c735b7eb718443eb0fd094"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to transpose a matrix on the GPU.  <br /></td></tr>
<tr class="separator:afe3f38f788c735b7eb718443eb0fd094"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a16823e30ad99965b64a03e2d4a91a699" id="r_a16823e30ad99965b64a03e2d4a91a699"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a16823e30ad99965b64a03e2d4a91a699">Transpose</a> (dim3 gridDim, dim3 blockDim, float *d_A, float *d_B, unsigned int rows, unsigned int cols, const std::vector&lt; size_t &gt; &amp;offset)</td></tr>
<tr class="memdesc:a16823e30ad99965b64a03e2d4a91a699"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to transpose a matrix on the GPU.  <br /></td></tr>
<tr class="separator:a16823e30ad99965b64a03e2d4a91a699"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a5af716524e248c61f3dce227d8ef6e34" id="r_a5af716524e248c61f3dce227d8ef6e34"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a5af716524e248c61f3dce227d8ef6e34">ScalarMul</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, float num, unsigned long long n)</td></tr>
<tr class="memdesc:a5af716524e248c61f3dce227d8ef6e34"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform scalar multiplication on the GPU.  <br /></td></tr>
<tr class="separator:a5af716524e248c61f3dce227d8ef6e34"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a27bc4025be4253d5fffae2bf1b43b3af" id="r_a27bc4025be4253d5fffae2bf1b43b3af"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a27bc4025be4253d5fffae2bf1b43b3af">ScalarDiv</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, float num, unsigned long long n)</td></tr>
<tr class="memdesc:a27bc4025be4253d5fffae2bf1b43b3af"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform scalar division on the GPU.  <br /></td></tr>
<tr class="separator:a27bc4025be4253d5fffae2bf1b43b3af"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a56f84e531825be8b2b0974c2488eb765" id="r_a56f84e531825be8b2b0974c2488eb765"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a56f84e531825be8b2b0974c2488eb765">ScalarAdd</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, float num, unsigned long long n)</td></tr>
<tr class="memdesc:a56f84e531825be8b2b0974c2488eb765"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to add a scalar to each element of a matrix on the GPU.  <br /></td></tr>
<tr class="separator:a56f84e531825be8b2b0974c2488eb765"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:af7069a420e81babb49b1bc009333d053" id="r_af7069a420e81babb49b1bc009333d053"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af7069a420e81babb49b1bc009333d053">Negation</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n)</td></tr>
<tr class="memdesc:af7069a420e81babb49b1bc009333d053"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to negate each element of a matrix on the GPU.  <br /></td></tr>
<tr class="separator:af7069a420e81babb49b1bc009333d053"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:adc047e65307dbc711235f637227b7d10" id="r_adc047e65307dbc711235f637227b7d10"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adc047e65307dbc711235f637227b7d10">Recip</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n)</td></tr>
<tr class="memdesc:adc047e65307dbc711235f637227b7d10"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the reciprocal of each element of a matrix on the GPU.  <br /></td></tr>
<tr class="separator:adc047e65307dbc711235f637227b7d10"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a8855f411733f7de29d013f4ad40096c9" id="r_a8855f411733f7de29d013f4ad40096c9"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8855f411733f7de29d013f4ad40096c9">RectifiedLinearUnit</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n)</td></tr>
<tr class="memdesc:a8855f411733f7de29d013f4ad40096c9"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Rectified Linear Unit (ReLU) activation on the GPU.  <br /></td></tr>
<tr class="separator:a8855f411733f7de29d013f4ad40096c9"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a4ddfc808de99fe831e74a3bd3f9bbdaf" id="r_a4ddfc808de99fe831e74a3bd3f9bbdaf"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4ddfc808de99fe831e74a3bd3f9bbdaf">ReLUBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *A, float *B_grad, unsigned long long n)</td></tr>
<tr class="memdesc:a4ddfc808de99fe831e74a3bd3f9bbdaf"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the ReLU activation during backpropagation.  <br /></td></tr>
<tr class="separator:a4ddfc808de99fe831e74a3bd3f9bbdaf"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a21bbbcf6d97bfaccc828ce7736814bd4" id="r_a21bbbcf6d97bfaccc828ce7736814bd4"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a21bbbcf6d97bfaccc828ce7736814bd4">Sigmoid</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n)</td></tr>
<tr class="memdesc:a21bbbcf6d97bfaccc828ce7736814bd4"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Sigmoid activation function on the GPU.  <br /></td></tr>
<tr class="separator:a21bbbcf6d97bfaccc828ce7736814bd4"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aff1f9f1bf9fb677024bd2b565fab9801" id="r_aff1f9f1bf9fb677024bd2b565fab9801"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aff1f9f1bf9fb677024bd2b565fab9801">SigmoidBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *B, float *B_grad, unsigned long long n)</td></tr>
<tr class="memdesc:aff1f9f1bf9fb677024bd2b565fab9801"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the Sigmoid activation during backpropagation.  <br /></td></tr>
<tr class="separator:aff1f9f1bf9fb677024bd2b565fab9801"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aeb7d10939b25508e0b5db1fe44f4b467" id="r_aeb7d10939b25508e0b5db1fe44f4b467"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aeb7d10939b25508e0b5db1fe44f4b467">Tanh</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n)</td></tr>
<tr class="memdesc:aeb7d10939b25508e0b5db1fe44f4b467"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Tanh activation function on the GPU.  <br /></td></tr>
<tr class="separator:aeb7d10939b25508e0b5db1fe44f4b467"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a90d501e72361b7341f36394af0f27c74" id="r_a90d501e72361b7341f36394af0f27c74"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a90d501e72361b7341f36394af0f27c74">TanhBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *B, float *B_grad, unsigned long long n)</td></tr>
<tr class="memdesc:a90d501e72361b7341f36394af0f27c74"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the Tanh activation during backpropagation.  <br /></td></tr>
<tr class="separator:a90d501e72361b7341f36394af0f27c74"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a04246c5218530f789a0ed4811b7ef3f3" id="r_a04246c5218530f789a0ed4811b7ef3f3"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a04246c5218530f789a0ed4811b7ef3f3">LeakyReLU</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n, float alpha=0.01f)</td></tr>
<tr class="memdesc:a04246c5218530f789a0ed4811b7ef3f3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Leaky ReLU activation function on the GPU.  <br /></td></tr>
<tr class="separator:a04246c5218530f789a0ed4811b7ef3f3"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a7eade95ddcf48141d69bb19803b22d51" id="r_a7eade95ddcf48141d69bb19803b22d51"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7eade95ddcf48141d69bb19803b22d51">LeakyReLUBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *A, float *B_grad, unsigned long long n, float alpha=0.01f)</td></tr>
<tr class="memdesc:a7eade95ddcf48141d69bb19803b22d51"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the Leaky ReLU activation during backpropagation.  <br /></td></tr>
<tr class="separator:a7eade95ddcf48141d69bb19803b22d51"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a997aa5460fd64fadf9b701fbf73e3fb2" id="r_a997aa5460fd64fadf9b701fbf73e3fb2"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a997aa5460fd64fadf9b701fbf73e3fb2">Swish</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n)</td></tr>
<tr class="memdesc:a997aa5460fd64fadf9b701fbf73e3fb2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Swish activation function on the GPU.  <br /></td></tr>
<tr class="separator:a997aa5460fd64fadf9b701fbf73e3fb2"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a6c5a4b54442aab42df5afe8688e71596" id="r_a6c5a4b54442aab42df5afe8688e71596"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a6c5a4b54442aab42df5afe8688e71596">SwishBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *A, float *B, float *B_grad, unsigned long long n)</td></tr>
<tr class="memdesc:a6c5a4b54442aab42df5afe8688e71596"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the Swish activation during backpropagation.  <br /></td></tr>
<tr class="separator:a6c5a4b54442aab42df5afe8688e71596"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a0e82aca250b46ac8ded8cae8936d7e38" id="r_a0e82aca250b46ac8ded8cae8936d7e38"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0e82aca250b46ac8ded8cae8936d7e38">ExponentialLinearUnit</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n, float alpha=1.0f)</td></tr>
<tr class="memdesc:a0e82aca250b46ac8ded8cae8936d7e38"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Exponential Linear Unit (ELU) activation function on the GPU.  <br /></td></tr>
<tr class="separator:a0e82aca250b46ac8ded8cae8936d7e38"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aee8ca471aa260bd1fca5b1797e229f9f" id="r_aee8ca471aa260bd1fca5b1797e229f9f"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aee8ca471aa260bd1fca5b1797e229f9f">ELUBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *A, float *B_grad, unsigned long long n, float alpha=1.0f)</td></tr>
<tr class="memdesc:aee8ca471aa260bd1fca5b1797e229f9f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the ELU activation during backpropagation.  <br /></td></tr>
<tr class="separator:aee8ca471aa260bd1fca5b1797e229f9f"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a52e449285e560185378234aecaf2f87c" id="r_a52e449285e560185378234aecaf2f87c"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a52e449285e560185378234aecaf2f87c">HardSigmoid</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n, float alpha=0.2f, float beta=0.5f)</td></tr>
<tr class="memdesc:a52e449285e560185378234aecaf2f87c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Hard Sigmoid activation function on the GPU.  <br /></td></tr>
<tr class="separator:a52e449285e560185378234aecaf2f87c"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a43232f9472ad3b974351e59386208efa" id="r_a43232f9472ad3b974351e59386208efa"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a43232f9472ad3b974351e59386208efa">HardSigmoidBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *A, float *B_grad, unsigned long long n, float alpha=0.2f, float beta=0.5f)</td></tr>
<tr class="memdesc:a43232f9472ad3b974351e59386208efa"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the Hard Sigmoid activation during backpropagation.  <br /></td></tr>
<tr class="separator:a43232f9472ad3b974351e59386208efa"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aef9c028ed356b5684e103639bb23bcf0" id="r_aef9c028ed356b5684e103639bb23bcf0"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aef9c028ed356b5684e103639bb23bcf0">HardSwish</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n, float alpha=0.2f, float beta=0.5f)</td></tr>
<tr class="memdesc:aef9c028ed356b5684e103639bb23bcf0"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Hard Swish activation function on the GPU.  <br /></td></tr>
<tr class="separator:aef9c028ed356b5684e103639bb23bcf0"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a455365870d43ff26687a731d15c4cdff" id="r_a455365870d43ff26687a731d15c4cdff"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a455365870d43ff26687a731d15c4cdff">HardSwishBackward</a> (dim3 gridDim, dim3 blockDim, float *A_grad, float *A, float *B_grad, unsigned long long n, float alpha=0.2f, float beta=0.5f)</td></tr>
<tr class="memdesc:a455365870d43ff26687a731d15c4cdff"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the Hard Swish activation during backpropagation.  <br /></td></tr>
<tr class="separator:a455365870d43ff26687a731d15c4cdff"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a51a5ff3c8cc2c3051fddf32de294b467" id="r_a51a5ff3c8cc2c3051fddf32de294b467"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a51a5ff3c8cc2c3051fddf32de294b467">SummationExp</a> (dim3 gridDim, dim3 blockDim, size_t sharedMemSize, float *out, float *g_data, unsigned long long n, size_t offset=0)</td></tr>
<tr class="memdesc:a51a5ff3c8cc2c3051fddf32de294b467"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the summation of exponentials of each element in the input array.  <br /></td></tr>
<tr class="separator:a51a5ff3c8cc2c3051fddf32de294b467"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:adbafc409d57fa0a9d78ecac5bf7b10a3" id="r_adbafc409d57fa0a9d78ecac5bf7b10a3"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#adbafc409d57fa0a9d78ecac5bf7b10a3">Softmax</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, float exp_sum_of_input, unsigned long long n, size_t offset=0)</td></tr>
<tr class="memdesc:adbafc409d57fa0a9d78ecac5bf7b10a3"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply the Softmax function on the GPU.  <br /></td></tr>
<tr class="separator:adbafc409d57fa0a9d78ecac5bf7b10a3"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a4375738c83ef892783abc210578e5b39" id="r_a4375738c83ef892783abc210578e5b39"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a4375738c83ef892783abc210578e5b39">SoftmaxJacobian</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, unsigned long long n)</td></tr>
<tr class="memdesc:a4375738c83ef892783abc210578e5b39"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the Jacobian of the Softmax function.  <br /></td></tr>
<tr class="separator:a4375738c83ef892783abc210578e5b39"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:af76ce6a930db4def5ceb51350af72f3c" id="r_af76ce6a930db4def5ceb51350af72f3c"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#af76ce6a930db4def5ceb51350af72f3c">MeanSquaredError</a> (dim3 gridDim, dim3 blockDim, size_t sharedMemSize, float *out, float *predict, float *real, unsigned long long n)</td></tr>
<tr class="memdesc:af76ce6a930db4def5ceb51350af72f3c"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the Mean Squared Error (MSE) loss between predicted and real values.  <br /></td></tr>
<tr class="separator:af76ce6a930db4def5ceb51350af72f3c"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ae77920db6adf79a17dbfb1dbf1ab5656" id="r_ae77920db6adf79a17dbfb1dbf1ab5656"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae77920db6adf79a17dbfb1dbf1ab5656">MSEBackward</a> (dim3 gridDim, dim3 blockDim, float *out, float *predict, float *real, unsigned long long n)</td></tr>
<tr class="memdesc:ae77920db6adf79a17dbfb1dbf1ab5656"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of the Mean Squared Error (MSE) loss for backpropagation.  <br /></td></tr>
<tr class="separator:ae77920db6adf79a17dbfb1dbf1ab5656"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aeec286d5351eee7061e151470adb4eef" id="r_aeec286d5351eee7061e151470adb4eef"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aeec286d5351eee7061e151470adb4eef">StochasticGradientDescent</a> (dim3 gridDim, dim3 blockDim, float *data, float *grad, float lr, unsigned long long n)</td></tr>
<tr class="memdesc:aeec286d5351eee7061e151470adb4eef"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform Stochastic Gradient Descent (SGD) optimization.  <br /></td></tr>
<tr class="separator:aeec286d5351eee7061e151470adb4eef"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:abf927faf0950fbc215564c67b8ac57be" id="r_abf927faf0950fbc215564c67b8ac57be"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abf927faf0950fbc215564c67b8ac57be">BinaryCrossEntropy</a> (dim3 gridDim, dim3 blockDim, size_t sharedMemSize, float *out, float *predict, float *real, unsigned long long n)</td></tr>
<tr class="memdesc:abf927faf0950fbc215564c67b8ac57be"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the Binary Cross Entropy (BCE) loss between predicted and real values.  <br /></td></tr>
<tr class="separator:abf927faf0950fbc215564c67b8ac57be"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a1fc3d553947a5cad87f29989f9d9465d" id="r_a1fc3d553947a5cad87f29989f9d9465d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1fc3d553947a5cad87f29989f9d9465d">BCEBackward</a> (dim3 gridDim, dim3 blockDim, float *out, float *predict, float *real, unsigned long long n)</td></tr>
<tr class="memdesc:a1fc3d553947a5cad87f29989f9d9465d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of Binary Cross Entropy (BCE) loss for backpropagation.  <br /></td></tr>
<tr class="separator:a1fc3d553947a5cad87f29989f9d9465d"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a273ef3023442a864f1028becaf236bae" id="r_a273ef3023442a864f1028becaf236bae"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a273ef3023442a864f1028becaf236bae">Momentum</a> (dim3 gridDim, dim3 blockDim, float *output, float *grad, float *velocity, float beta, unsigned long long n)</td></tr>
<tr class="memdesc:a273ef3023442a864f1028becaf236bae"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply Momentum optimization.  <br /></td></tr>
<tr class="separator:a273ef3023442a864f1028becaf236bae"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a1e915bd4a354938d8bc2d09be00eae76" id="r_a1e915bd4a354938d8bc2d09be00eae76"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1e915bd4a354938d8bc2d09be00eae76">AdaGrad</a> (dim3 gridDim, dim3 blockDim, float *data, float *G, float *grad, float lr, float eps, unsigned long long n)</td></tr>
<tr class="memdesc:a1e915bd4a354938d8bc2d09be00eae76"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply AdaGrad optimization.  <br /></td></tr>
<tr class="separator:a1e915bd4a354938d8bc2d09be00eae76"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aaf3c9cca114d003130ffa4354b4a24de" id="r_aaf3c9cca114d003130ffa4354b4a24de"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aaf3c9cca114d003130ffa4354b4a24de">RMSprop</a> (dim3 gridDim, dim3 blockDim, float *data, float *v, float *grad, float lr, float beta, float eps, unsigned long long n)</td></tr>
<tr class="memdesc:aaf3c9cca114d003130ffa4354b4a24de"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply RMSprop optimization.  <br /></td></tr>
<tr class="separator:aaf3c9cca114d003130ffa4354b4a24de"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a2b9ab840eeb0e74f4b78277a046b3a07" id="r_a2b9ab840eeb0e74f4b78277a046b3a07"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a2b9ab840eeb0e74f4b78277a046b3a07">Adam</a> (dim3 gridDim, dim3 blockDim, float *data, float *m, float *v, float *grad, float lr, float beta1, float beta2, float eps, int t, unsigned long long n)</td></tr>
<tr class="memdesc:a2b9ab840eeb0e74f4b78277a046b3a07"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply Adam optimization.  <br /></td></tr>
<tr class="separator:a2b9ab840eeb0e74f4b78277a046b3a07"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ada94b8c5c6e6d72132face63a3305624" id="r_ada94b8c5c6e6d72132face63a3305624"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ada94b8c5c6e6d72132face63a3305624">NAdam</a> (dim3 gridDim, dim3 blockDim, float *data, float *m, float *m_modified, float *v, float *grad, float lr, float beta1, float beta2, float eps, int t, unsigned long long n)</td></tr>
<tr class="memdesc:ada94b8c5c6e6d72132face63a3305624"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply NAdam optimization.  <br /></td></tr>
<tr class="separator:ada94b8c5c6e6d72132face63a3305624"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a1f71726879c2d6a9d790522cdc1576e1" id="r_a1f71726879c2d6a9d790522cdc1576e1"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1f71726879c2d6a9d790522cdc1576e1">AdaDelta</a> (dim3 gridDim, dim3 blockDim, float *data, float *acc_delta, float *acc_grad, float *grad, float rho, float eps, unsigned long long n)</td></tr>
<tr class="memdesc:a1f71726879c2d6a9d790522cdc1576e1"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to apply AdaDelta optimization.  <br /></td></tr>
<tr class="separator:a1f71726879c2d6a9d790522cdc1576e1"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aa84aa2397f4f5a09a96bef76726e46f0" id="r_aa84aa2397f4f5a09a96bef76726e46f0"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa84aa2397f4f5a09a96bef76726e46f0">TensorCoreGEMM</a> (float *A, float *B, float *C, unsigned long long M, unsigned long long N, unsigned long long K)</td></tr>
<tr class="memdesc:aa84aa2397f4f5a09a96bef76726e46f0"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform fast matrix multiplication using Tensor Cores with half-precision (FP16) support.  <br /></td></tr>
<tr class="separator:aa84aa2397f4f5a09a96bef76726e46f0"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ad136c8a6560a5305984ce0a31bea71bf" id="r_ad136c8a6560a5305984ce0a31bea71bf"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ad136c8a6560a5305984ce0a31bea71bf">Fill</a> (dim3 gridDim, dim3 blockDim, float *data, float value, unsigned long long n, size_t offset=0)</td></tr>
<tr class="memdesc:ad136c8a6560a5305984ce0a31bea71bf"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to fill a data array with a given value.  <br /></td></tr>
<tr class="separator:ad136c8a6560a5305984ce0a31bea71bf"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a8ec4524fdefd3d771c72e77e94281c88" id="r_a8ec4524fdefd3d771c72e77e94281c88"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a8ec4524fdefd3d771c72e77e94281c88">HadamardProduct</a> (dim3 gridDim, dim3 blockDim, float *out, float *in1, float *in2, unsigned long long n)</td></tr>
<tr class="memdesc:a8ec4524fdefd3d771c72e77e94281c88"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform element-wise Hadamard product of two arrays.  <br /></td></tr>
<tr class="separator:a8ec4524fdefd3d771c72e77e94281c88"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:aa61cded4977bb2dc3720f7057cc2fb47" id="r_aa61cded4977bb2dc3720f7057cc2fb47"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#aa61cded4977bb2dc3720f7057cc2fb47">ElementwiseDivide</a> (dim3 gridDim, dim3 blockDim, float *out, float *in1, float *in2, unsigned long long n, size_t offset_o=0, size_t offset_1=0, size_t offset_2=0)</td></tr>
<tr class="memdesc:aa61cded4977bb2dc3720f7057cc2fb47"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform element-wise division of two arrays.  <br /></td></tr>
<tr class="separator:aa61cded4977bb2dc3720f7057cc2fb47"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a1ae846a65c2f5b83cd1b9fc61b877854" id="r_a1ae846a65c2f5b83cd1b9fc61b877854"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1ae846a65c2f5b83cd1b9fc61b877854">Summation</a> (dim3 gridDim, dim3 blockDim, unsigned long long sharedMemSize, float *out, float *in, unsigned long long n, size_t offset=0)</td></tr>
<tr class="memdesc:a1ae846a65c2f5b83cd1b9fc61b877854"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to sum the elements of a single input array.  <br /></td></tr>
<tr class="separator:a1ae846a65c2f5b83cd1b9fc61b877854"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a0ed44a68bfb86a9fd3d6c3b25614713f" id="r_a0ed44a68bfb86a9fd3d6c3b25614713f"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0ed44a68bfb86a9fd3d6c3b25614713f">gradCopy</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t n, const std::vector&lt; size_t &gt; &amp;offset_o, const std::vector&lt; size_t &gt; &amp;offset_i)</td></tr>
<tr class="memdesc:a0ed44a68bfb86a9fd3d6c3b25614713f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Copies gradient data from one array to another with specified offsets.  <br /></td></tr>
<tr class="separator:a0ed44a68bfb86a9fd3d6c3b25614713f"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a9ac0590fbb5eb7f51b05da574e9845a8" id="r_a9ac0590fbb5eb7f51b05da574e9845a8"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a9ac0590fbb5eb7f51b05da574e9845a8">NgradCopy</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t n, const std::vector&lt; size_t &gt; &amp;offset_o, const std::vector&lt; size_t &gt; &amp;offset_i)</td></tr>
<tr class="memdesc:a9ac0590fbb5eb7f51b05da574e9845a8"><td class="mdescLeft">&#160;</td><td class="mdescRight">Copies gradient data from one array to another with specified offsets.  <br /></td></tr>
<tr class="separator:a9ac0590fbb5eb7f51b05da574e9845a8"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ae45dbebceb76ddf82fa5e6b9df882e62" id="r_ae45dbebceb76ddf82fa5e6b9df882e62"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#ae45dbebceb76ddf82fa5e6b9df882e62">Expand</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t n, size_t total)</td></tr>
<tr class="memdesc:ae45dbebceb76ddf82fa5e6b9df882e62"><td class="mdescLeft">&#160;</td><td class="mdescRight">Expands the input array into the output array with a specified total size.  <br /></td></tr>
<tr class="separator:ae45dbebceb76ddf82fa5e6b9df882e62"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a454a28ef0e22014efca1ede4e954db65" id="r_a454a28ef0e22014efca1ede4e954db65"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a454a28ef0e22014efca1ede4e954db65">Compress</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t n, size_t total)</td></tr>
<tr class="memdesc:a454a28ef0e22014efca1ede4e954db65"><td class="mdescLeft">&#160;</td><td class="mdescRight">Compresses the input array into the output array with a specified total size.  <br /></td></tr>
<tr class="separator:a454a28ef0e22014efca1ede4e954db65"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a3a781324400c54c35dd564f3599dca8e" id="r_a3a781324400c54c35dd564f3599dca8e"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a3a781324400c54c35dd564f3599dca8e">img2col</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t H_out, size_t W_out, size_t C, size_t K_h, size_t K_w, size_t stride, size_t pad, size_t H_in, size_t W_in, size_t batch)</td></tr>
<tr class="memdesc:a3a781324400c54c35dd564f3599dca8e"><td class="mdescLeft">&#160;</td><td class="mdescRight">Rearranges image data into column format for convolution operations.  <br /></td></tr>
<tr class="separator:a3a781324400c54c35dd564f3599dca8e"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a1c2b7a6f28d2af22f9a2623c5ae62bff" id="r_a1c2b7a6f28d2af22f9a2623c5ae62bff"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a1c2b7a6f28d2af22f9a2623c5ae62bff">img2colBackward</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t H_out, size_t W_out, size_t C, size_t K_h, size_t K_w, size_t stride, size_t pad, size_t H_in, size_t W_in, size_t batch)</td></tr>
<tr class="memdesc:a1c2b7a6f28d2af22f9a2623c5ae62bff"><td class="mdescLeft">&#160;</td><td class="mdescRight">Rearranges columnar data back into image format for backpropagation in convolution operations.  <br /></td></tr>
<tr class="separator:a1c2b7a6f28d2af22f9a2623c5ae62bff"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a7c061f5511c3ab9d36563757bd969ff7" id="r_a7c061f5511c3ab9d36563757bd969ff7"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a7c061f5511c3ab9d36563757bd969ff7">col2img</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t H_out, size_t W_out, size_t C_out, size_t batches)</td></tr>
<tr class="memdesc:a7c061f5511c3ab9d36563757bd969ff7"><td class="mdescLeft">&#160;</td><td class="mdescRight">Rearranges columnar data back into image format.  <br /></td></tr>
<tr class="separator:a7c061f5511c3ab9d36563757bd969ff7"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a028970809074d79f28ff94f62b3edaa4" id="r_a028970809074d79f28ff94f62b3edaa4"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a028970809074d79f28ff94f62b3edaa4">col2imgBackward</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t H_out, size_t W_out, size_t C_out, size_t batches)</td></tr>
<tr class="memdesc:a028970809074d79f28ff94f62b3edaa4"><td class="mdescLeft">&#160;</td><td class="mdescRight">Rearranges columnar data back into image format for backpropagation.  <br /></td></tr>
<tr class="separator:a028970809074d79f28ff94f62b3edaa4"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:addaa377a94d007df2690043b08904e28" id="r_addaa377a94d007df2690043b08904e28"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#addaa377a94d007df2690043b08904e28">AveragePooling</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t pool_size, size_t stride, size_t padding, size_t batches, size_t channels, size_t H_in, size_t W_in, size_t H_out, size_t W_out)</td></tr>
<tr class="memdesc:addaa377a94d007df2690043b08904e28"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform average pooling on the GPU.  <br /></td></tr>
<tr class="separator:addaa377a94d007df2690043b08904e28"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a551402f9c55653c9fae63e172a5fb250" id="r_a551402f9c55653c9fae63e172a5fb250"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a551402f9c55653c9fae63e172a5fb250">AveragePoolingBackward</a> (dim3 gridDim, dim3 blockDim, float *out, float *in, size_t pool_size, size_t stride, size_t padding, size_t batches, size_t channels, size_t H_in, size_t W_in, size_t H_out, size_t W_out)</td></tr>
<tr class="memdesc:a551402f9c55653c9fae63e172a5fb250"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of average pooling during backpropagation.  <br /></td></tr>
<tr class="separator:a551402f9c55653c9fae63e172a5fb250"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a73ceb77688c4008dc350fc87b99875aa" id="r_a73ceb77688c4008dc350fc87b99875aa"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a73ceb77688c4008dc350fc87b99875aa">GlobalAvgPoolBackward</a> (dim3 gridDim, dim3 blockDim, float *output, float *in, size_t batches, size_t channels, size_t height, size_t width)</td></tr>
<tr class="memdesc:a73ceb77688c4008dc350fc87b99875aa"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of global average pooling during backpropagation.  <br /></td></tr>
<tr class="separator:a73ceb77688c4008dc350fc87b99875aa"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:abcc632e5a7104c1a28208e94a4ce6e28" id="r_abcc632e5a7104c1a28208e94a4ce6e28"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#abcc632e5a7104c1a28208e94a4ce6e28">MaxPooling</a> (dim3 gridDim, dim3 blockDim, float *output, float *position, float *input, size_t pool_size, size_t stride, size_t padding, size_t batches, size_t channels, size_t H_in, size_t W_in, size_t H_out, size_t W_out)</td></tr>
<tr class="memdesc:abcc632e5a7104c1a28208e94a4ce6e28"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to perform max pooling on the GPU.  <br /></td></tr>
<tr class="separator:abcc632e5a7104c1a28208e94a4ce6e28"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:a0d5f5f4c9e89a8d914a7f2f802d1caab" id="r_a0d5f5f4c9e89a8d914a7f2f802d1caab"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="#a0d5f5f4c9e89a8d914a7f2f802d1caab">MaxPoolingBackward</a> (dim3 gridDim, dim3 blockDim, float *output, float *position, float *input, size_t pool_size, size_t stride, size_t padding, size_t batches, size_t channels, size_t H_in, size_t W_in, size_t H_out, size_t W_out)</td></tr>
<tr class="memdesc:a0d5f5f4c9e89a8d914a7f2f802d1caab"><td class="mdescLeft">&#160;</td><td class="mdescRight">Kernel function to compute the gradient of max pooling during backpropagation.  <br /></td></tr>
<tr class="separator:a0d5f5f4c9e89a8d914a7f2f802d1caab"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table>
<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
<div class="textblock"><p>High-Performance CUDA Kernel Implementations for Tensor Computations. </p>
<p>The <a class="el" href="namespacenz_1_1krnl.html" title="High-Performance CUDA Kernel Implementations for Tensor Computations.">nz::krnl</a> namespace provides an extensive collection of CUDA kernel functions optimized for accelerated tensor operations and deep learning computations.</p>
<h1><a class="anchor" id="kernel_categories"></a>
Kernel Function Categories</h1>
<p>The namespace encompasses several critical categories of computational kernels:</p>
<h2><a class="anchor" id="matrix_ops"></a>
Matrix Operations</h2>
<ul>
<li>Matrix addition and subtraction</li>
<li>General matrix multiplication</li>
<li>Matrix transposition</li>
</ul>
<h2><a class="anchor" id="scalar_ops"></a>
Scalar Operations</h2>
<ul>
<li>Element-wise scalar multiplication</li>
<li>Element-wise scalar division</li>
<li>Element-wise scalar addition</li>
<li>Negation</li>
<li>Reciprocal calculations</li>
</ul>
<h2><a class="anchor" id="activation_funcs"></a>
Activation Functions</h2>
<p>Piecewise-linear Activations:</p><ul>
<li>ReLU (Rectified Linear Unit)</li>
<li>Leaky ReLU</li>
</ul>
<p>Non-linear Activations:</p><ul>
<li>Sigmoid</li>
<li>Hard Sigmoid</li>
<li>Tanh</li>
<li>Swish</li>
<li>Exponential Linear Unit (ELU)</li>
<li>Hard Swish</li>
</ul>
<h2><a class="anchor" id="backward_props"></a>
Backward Propagation Kernels</h2>
<p>Gradient computation kernels for each activation function, supporting efficient backpropagation in neural network training.</p>
<h2><a class="anchor" id="loss_funcs"></a>
Loss Functions</h2>
<ul>
<li>Mean Squared Error (MSE)</li>
<li>Binary Cross-Entropy (BCE)</li>
</ul>
<h2><a class="anchor" id="optimization_algos"></a>
Optimization Algorithms</h2>
<ul>
<li>Stochastic Gradient Descent (SGD)</li>
<li>Momentum</li>
<li>AdaGrad</li>
<li>RMSprop</li>
<li>Adam</li>
<li>NAdam</li>
<li>AdaDelta</li>
</ul>
<dl class="section note"><dt>Note</dt><dd>Performance Characteristics<ul>
<li>Designed for parallel execution on CUDA-enabled GPUs</li>
<li>Uses <code>unsigned long long</code> to support large tensor dimensions</li>
<li>Operates on raw float pointers for maximum performance and flexibility</li>
</ul>
</dd></dl>
<dl class="section warning"><dt>Warning</dt><dd>These low-level CUDA kernels are intended for internal library implementation. End-users should NOT directly invoke these kernels.</dd></dl>
<dl class="section see"><dt>See also</dt><dd><a class="el" href="_operation_kernels_8cuh.html" title="CUDA Kernel Definitions for High-Performance Tensor Operations.">OperationKernels.cuh</a></dd></dl>
<dl class="section author"><dt>Author</dt><dd>Mgepahmge </dd></dl>
<dl class="section date"><dt>Date</dt><dd>2024/12/07 </dd></dl>
</div><h2 class="groupheader">Function Documentation</h2>
<a id="a1f71726879c2d6a9d790522cdc1576e1" name="a1f71726879c2d6a9d790522cdc1576e1"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a1f71726879c2d6a9d790522cdc1576e1">&#9670;&#160;</a></span>AdaDelta()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::AdaDelta </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>acc_delta</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>acc_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>rho</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>eps</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply AdaDelta optimization. </p>
<p>This function updates the data array using AdaDelta optimization, which maintains moving averages of both the squared gradients and the squared parameter updates (deltas) to adapt the step size for each parameter without an explicit learning rate.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">data</td><td>Pointer to the data array that will be updated </td></tr>
    <tr><td class="paramname">acc_delta</td><td>Pointer to the accumulated delta values </td></tr>
    <tr><td class="paramname">acc_grad</td><td>Pointer to the accumulated squared gradient values </td></tr>
    <tr><td class="paramname">grad</td><td>Pointer to the gradient array </td></tr>
    <tr><td class="paramname">rho</td><td>The decay rate for the moving averages (typically between 0.9 and 0.95) </td></tr>
    <tr><td class="paramname">eps</td><td>A small constant to avoid division by zero (typically 1e-8) </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the data, gradient, and accumulated values arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00815">815</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a1f71726879c2d6a9d790522cdc1576e1_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a1f71726879c2d6a9d790522cdc1576e1_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a1f71726879c2d6a9d790522cdc1576e1_cgraph" id="anamespacenz_1_1krnl_a1f71726879c2d6a9d790522cdc1576e1_cgraph">
<area shape="rect" title="Kernel function to apply AdaDelta optimization." alt="" coords="5,13,132,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="180,5,365,48"/>
<area shape="poly" title=" " alt="" coords="132,24,164,24,164,29,132,29"/>
</map>
</div>

</div>
</div>
<a id="a1e915bd4a354938d8bc2d09be00eae76" name="a1e915bd4a354938d8bc2d09be00eae76"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a1e915bd4a354938d8bc2d09be00eae76">&#9670;&#160;</a></span>AdaGrad()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::AdaGrad </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>G</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>lr</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>eps</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply AdaGrad optimization. </p>
<p>This function updates the data array using AdaGrad optimization, which scales the learning rate for each parameter according to its accumulated squared gradients.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">data</td><td>Pointer to the data array that will be updated </td></tr>
    <tr><td class="paramname">G</td><td>Pointer to the array of accumulated squared gradients </td></tr>
    <tr><td class="paramname">grad</td><td>Pointer to the gradient array </td></tr>
    <tr><td class="paramname">lr</td><td>The learning rate used for the gradient update </td></tr>
    <tr><td class="paramname">eps</td><td>A small constant to avoid division by zero (typically 1e-8) </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the data, gradient, and accumulated gradient arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00731">731</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a1e915bd4a354938d8bc2d09be00eae76_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a1e915bd4a354938d8bc2d09be00eae76_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a1e915bd4a354938d8bc2d09be00eae76_cgraph" id="anamespacenz_1_1krnl_a1e915bd4a354938d8bc2d09be00eae76_cgraph">
<area shape="rect" title="Kernel function to apply AdaGrad optimization." alt="" coords="5,13,129,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="177,5,362,48"/>
<area shape="poly" title=" " alt="" coords="129,24,161,24,161,29,129,29"/>
</map>
</div>

</div>
</div>
<a id="a2b9ab840eeb0e74f4b78277a046b3a07" name="a2b9ab840eeb0e74f4b78277a046b3a07"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a2b9ab840eeb0e74f4b78277a046b3a07">&#9670;&#160;</a></span>Adam()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Adam </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>m</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>v</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>lr</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta1</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta2</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>eps</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>t</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply Adam optimization. </p>
<p>This function updates the data array using Adam optimization, which combines momentum and RMSprop to adaptively adjust the learning rates of each parameter.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">data</td><td>Pointer to the data array that will be updated </td></tr>
    <tr><td class="paramname">m</td><td>Pointer to the first moment estimate (mean of gradients) </td></tr>
    <tr><td class="paramname">v</td><td>Pointer to the second moment estimate (uncentered variance of gradients) </td></tr>
    <tr><td class="paramname">grad</td><td>Pointer to the gradient array </td></tr>
    <tr><td class="paramname">lr</td><td>The learning rate used for the gradient update </td></tr>
    <tr><td class="paramname">beta1</td><td>The exponential decay rate for the first moment estimate (typically 0.9) </td></tr>
    <tr><td class="paramname">beta2</td><td>The exponential decay rate for the second moment estimate (typically 0.999) </td></tr>
    <tr><td class="paramname">eps</td><td>A small constant to avoid division by zero (typically 1e-8) </td></tr>
    <tr><td class="paramname">t</td><td>The current time step or iteration </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the data, gradient, and moment arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00768">768</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a2b9ab840eeb0e74f4b78277a046b3a07_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a2b9ab840eeb0e74f4b78277a046b3a07_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a2b9ab840eeb0e74f4b78277a046b3a07_cgraph" id="anamespacenz_1_1krnl_a2b9ab840eeb0e74f4b78277a046b3a07_cgraph">
<area shape="rect" title="Kernel function to apply Adam optimization." alt="" coords="5,13,112,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="160,5,345,48"/>
<area shape="poly" title=" " alt="" coords="112,24,144,24,144,29,112,29"/>
</map>
</div>

</div>
</div>
<a id="addaa377a94d007df2690043b08904e28" name="addaa377a94d007df2690043b08904e28"></a>
<h2 class="memtitle"><span class="permalink"><a href="#addaa377a94d007df2690043b08904e28">&#9670;&#160;</a></span>AveragePooling()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::AveragePooling </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>pool_size</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>padding</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batches</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>channels</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform average pooling on the GPU. </p>
<p>This function applies average pooling to the input tensor, reducing its spatial dimensions by computing the average value within each pooling window.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the pooled results will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array containing the original data. </td></tr>
    <tr><td class="paramname">pool_size</td><td>The size of the pooling window. </td></tr>
    <tr><td class="paramname">stride</td><td>The stride of the pooling operation. </td></tr>
    <tr><td class="paramname">padding</td><td>The padding applied to the input tensor. </td></tr>
    <tr><td class="paramname">batches</td><td>The number of batches in the input tensor. </td></tr>
    <tr><td class="paramname">channels</td><td>The number of channels in the input tensor. </td></tr>
    <tr><td class="paramname">H_in</td><td>The height of the input tensor. </td></tr>
    <tr><td class="paramname">W_in</td><td>The width of the input tensor. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output tensor. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output tensor. </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01431">1431</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_addaa377a94d007df2690043b08904e28_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_addaa377a94d007df2690043b08904e28_cgraph" alt="Call graph for nz::krnl::AveragePooling()"/></div>
<map name="anamespacenz_1_1krnl_addaa377a94d007df2690043b08904e28_cgraph" id="anamespacenz_1_1krnl_addaa377a94d007df2690043b08904e28_cgraph">
<area shape="rect" title="Kernel function to perform average pooling on the GPU." alt="" coords="5,47,169,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="217,5,402,48"/>
<area shape="poly" title=" " alt="" coords="168,45,201,40,202,45,169,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="217,72,402,115"/>
<area shape="poly" title=" " alt="" coords="169,70,202,75,201,80,168,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="450,39,635,81"/>
<area shape="poly" title=" " alt="" coords="402,77,434,73,435,78,403,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="450,105,635,148"/>
<area shape="poly" title=" " alt="" coords="403,104,435,109,434,114,402,109"/>
</map>
</div>

</div>
</div>
<a id="a551402f9c55653c9fae63e172a5fb250" name="a551402f9c55653c9fae63e172a5fb250"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a551402f9c55653c9fae63e172a5fb250">&#9670;&#160;</a></span>AveragePoolingBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::AveragePoolingBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>pool_size</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>padding</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batches</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>channels</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of average pooling during backpropagation. </p>
<p>This function computes the gradient of the average pooling operation, distributing the gradient values evenly across the pooling window.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the gradient will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array containing the gradient from the next layer. </td></tr>
    <tr><td class="paramname">pool_size</td><td>The size of the pooling window. </td></tr>
    <tr><td class="paramname">stride</td><td>The stride of the pooling operation. </td></tr>
    <tr><td class="paramname">padding</td><td>The padding applied to the input tensor. </td></tr>
    <tr><td class="paramname">batches</td><td>The number of batches in the input tensor. </td></tr>
    <tr><td class="paramname">channels</td><td>The number of channels in the input tensor. </td></tr>
    <tr><td class="paramname">H_in</td><td>The height of the input tensor. </td></tr>
    <tr><td class="paramname">W_in</td><td>The width of the input tensor. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output tensor. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output tensor. </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01484">1484</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a551402f9c55653c9fae63e172a5fb250_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a551402f9c55653c9fae63e172a5fb250_cgraph" alt="Call graph for nz::krnl::AveragePoolingBackward()"/></div>
<map name="anamespacenz_1_1krnl_a551402f9c55653c9fae63e172a5fb250_cgraph" id="anamespacenz_1_1krnl_a551402f9c55653c9fae63e172a5fb250_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of average pooling during backpropagation." alt="" coords="5,39,169,81"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="217,5,402,48"/>
<area shape="poly" title=" " alt="" coords="168,45,201,40,202,45,169,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="217,72,402,115"/>
<area shape="poly" title=" " alt="" coords="169,70,202,75,201,80,168,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="450,39,635,81"/>
<area shape="poly" title=" " alt="" coords="402,77,434,73,435,78,403,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="450,105,635,148"/>
<area shape="poly" title=" " alt="" coords="403,104,435,109,434,114,402,109"/>
</map>
</div>

</div>
</div>
<a id="a1fc3d553947a5cad87f29989f9d9465d" name="a1fc3d553947a5cad87f29989f9d9465d"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a1fc3d553947a5cad87f29989f9d9465d">&#9670;&#160;</a></span>BCEBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::BCEBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>predict</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>real</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of Binary Cross Entropy (BCE) loss for backpropagation. </p>
<p>This function computes the gradient of the Binary Cross Entropy loss between the predicted and real values for each element in the input arrays and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the BCE gradient will be stored. </td></tr>
    <tr><td class="paramname">predict</td><td>Pointer to the predicted values. </td></tr>
    <tr><td class="paramname">real</td><td>Pointer to the real values. </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input arrays. </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00701">701</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a1fc3d553947a5cad87f29989f9d9465d_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a1fc3d553947a5cad87f29989f9d9465d_cgraph" alt="Call graph for nz::krnl::BCEBackward()"/></div>
<map name="anamespacenz_1_1krnl_a1fc3d553947a5cad87f29989f9d9465d_cgraph" id="anamespacenz_1_1krnl_a1fc3d553947a5cad87f29989f9d9465d_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of Binary Cross Entropy (BCE) loss for backpropagation." alt="" coords="5,47,164,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="212,5,397,48"/>
<area shape="poly" title=" " alt="" coords="164,45,196,40,196,46,165,51"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="212,72,397,115"/>
<area shape="poly" title=" " alt="" coords="165,69,196,74,196,80,164,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="445,39,630,81"/>
<area shape="poly" title=" " alt="" coords="397,77,429,73,430,78,398,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="445,105,630,148"/>
<area shape="poly" title=" " alt="" coords="398,104,430,109,429,114,397,109"/>
</map>
</div>

</div>
</div>
<a id="abf927faf0950fbc215564c67b8ac57be" name="abf927faf0950fbc215564c67b8ac57be"></a>
<h2 class="memtitle"><span class="permalink"><a href="#abf927faf0950fbc215564c67b8ac57be">&#9670;&#160;</a></span>BinaryCrossEntropy()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::BinaryCrossEntropy </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>sharedMemSize</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>predict</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>real</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the Binary Cross Entropy (BCE) loss between predicted and real values. </p>
<p>This function computes the Binary Cross Entropy loss between the predicted and real values for each element in the input arrays and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">sharedMemSize</td><td>The size of the shared memory buffer used by the kernel. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the BCE result will be stored. </td></tr>
    <tr><td class="paramname">predict</td><td>Pointer to the predicted values. </td></tr>
    <tr><td class="paramname">real</td><td>Pointer to the real values. </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input arrays. </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00686">686</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_abf927faf0950fbc215564c67b8ac57be_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_abf927faf0950fbc215564c67b8ac57be_cgraph" alt="Call graph for nz::krnl::BinaryCrossEntropy()"/></div>
<map name="anamespacenz_1_1krnl_abf927faf0950fbc215564c67b8ac57be_cgraph" id="anamespacenz_1_1krnl_abf927faf0950fbc215564c67b8ac57be_cgraph">
<area shape="rect" title="Kernel function to compute the Binary Cross Entropy (BCE) loss between predicted and real values." alt="" coords="5,47,195,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="243,5,428,48"/>
<area shape="poly" title=" " alt="" coords="195,44,227,39,228,45,196,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="243,72,428,115"/>
<area shape="poly" title=" " alt="" coords="196,71,228,75,227,81,195,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="476,39,661,81"/>
<area shape="poly" title=" " alt="" coords="428,77,460,73,461,78,429,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="476,105,661,148"/>
<area shape="poly" title=" " alt="" coords="429,104,461,109,460,114,428,109"/>
</map>
</div>

</div>
</div>
<a id="a7c061f5511c3ab9d36563757bd969ff7" name="a7c061f5511c3ab9d36563757bd969ff7"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a7c061f5511c3ab9d36563757bd969ff7">&#9670;&#160;</a></span>col2img()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::col2img </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>C_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batches</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Rearranges columnar data back into image format. </p>
<p>This kernel function transforms columnar data into its original image format. It is typically used in operations where data needs to be reconstructed from a columnar representation, such as after convolution operations.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the reconstructed image data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input columnar data array. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output image. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output image. </td></tr>
    <tr><td class="paramname">C_out</td><td>The number of output channels. </td></tr>
    <tr><td class="paramname">batches</td><td>The number of images in the batch.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01378">1378</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a7c061f5511c3ab9d36563757bd969ff7_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a7c061f5511c3ab9d36563757bd969ff7_cgraph" alt="Call graph for nz::krnl::col2img()"/></div>
<map name="anamespacenz_1_1krnl_a7c061f5511c3ab9d36563757bd969ff7_cgraph" id="anamespacenz_1_1krnl_a7c061f5511c3ab9d36563757bd969ff7_cgraph">
<area shape="rect" title="Rearranges columnar data back into image format." alt="" coords="5,47,123,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="171,5,356,48"/>
<area shape="poly" title=" " alt="" coords="123,48,155,42,156,47,124,53"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="171,72,356,115"/>
<area shape="poly" title=" " alt="" coords="124,67,156,73,155,78,123,72"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="404,39,589,81"/>
<area shape="poly" title=" " alt="" coords="356,77,388,73,389,78,357,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="404,105,589,148"/>
<area shape="poly" title=" " alt="" coords="357,104,389,109,388,114,356,109"/>
</map>
</div>

</div>
</div>
<a id="a028970809074d79f28ff94f62b3edaa4" name="a028970809074d79f28ff94f62b3edaa4"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a028970809074d79f28ff94f62b3edaa4">&#9670;&#160;</a></span>col2imgBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::col2imgBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>C_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batches</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Rearranges columnar data back into image format for backpropagation. </p>
<p>This kernel function transforms columnar data back into its original image format. It is typically used during the backpropagation phase of convolutional neural networks to reconstruct the gradient of the input image.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the reconstructed image data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input columnar data array. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output image. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output image. </td></tr>
    <tr><td class="paramname">C_out</td><td>The number of output channels. </td></tr>
    <tr><td class="paramname">batches</td><td>The number of images in the batch.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01398">1398</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a028970809074d79f28ff94f62b3edaa4_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a028970809074d79f28ff94f62b3edaa4_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a028970809074d79f28ff94f62b3edaa4_cgraph" id="anamespacenz_1_1krnl_a028970809074d79f28ff94f62b3edaa4_cgraph">
<area shape="rect" title="Rearranges columnar data back into image format for backpropagation." alt="" coords="5,47,181,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="229,5,414,48"/>
<area shape="poly" title=" " alt="" coords="181,45,213,40,213,45,181,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="229,72,414,115"/>
<area shape="poly" title=" " alt="" coords="181,70,213,75,213,80,181,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="462,39,647,81"/>
<area shape="poly" title=" " alt="" coords="414,77,446,73,447,78,415,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="462,105,647,148"/>
<area shape="poly" title=" " alt="" coords="415,104,447,109,446,114,414,109"/>
</map>
</div>

</div>
</div>
<a id="a454a28ef0e22014efca1ede4e954db65" name="a454a28ef0e22014efca1ede4e954db65"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a454a28ef0e22014efca1ede4e954db65">&#9670;&#160;</a></span>Compress()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Compress </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>total</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Compresses the input array into the output array with a specified total size. </p>
<p>This kernel function reduces the size of the input array by compressing its elements into the output array to match the specified total size.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the compressed data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array containing the original data. </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input array. </td></tr>
    <tr><td class="paramname">total</td><td>The total number of elements in the output array after compression.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01303">1303</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a454a28ef0e22014efca1ede4e954db65_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a454a28ef0e22014efca1ede4e954db65_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a454a28ef0e22014efca1ede4e954db65_cgraph" id="anamespacenz_1_1krnl_a454a28ef0e22014efca1ede4e954db65_cgraph">
<area shape="rect" title="Compresses the input array into the output array with a specified total size." alt="" coords="5,47,138,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="186,5,371,48"/>
<area shape="poly" title=" " alt="" coords="138,47,170,41,171,47,138,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="186,72,371,115"/>
<area shape="poly" title=" " alt="" coords="138,68,171,73,170,79,138,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="419,39,604,81"/>
<area shape="poly" title=" " alt="" coords="371,77,403,73,404,78,372,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="419,105,604,148"/>
<area shape="poly" title=" " alt="" coords="372,104,404,109,403,114,371,109"/>
</map>
</div>

</div>
</div>
<a id="aa61cded4977bb2dc3720f7057cc2fb47" name="aa61cded4977bb2dc3720f7057cc2fb47"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aa61cded4977bb2dc3720f7057cc2fb47">&#9670;&#160;</a></span>ElementwiseDivide()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::ElementwiseDivide </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in1</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in2</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_o</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_1</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_2</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform element-wise division of two arrays. </p>
<p>This function performs element-wise division of two input arrays and stores the result in an output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array </td></tr>
    <tr><td class="paramname">in1</td><td>Pointer to the first input array </td></tr>
    <tr><td class="paramname">in2</td><td>Pointer to the second input array </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
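    <tr><td class="paramname">offset_o</td><td>Offset applied to the output array (default 0) </td></tr>
    <tr><td class="paramname">offset_1</td><td>Offset applied to the first input array (default 0) </td></tr>
    <tr><td class="paramname">offset_2</td><td>Offset applied to the second input array (default 0) </td></tr>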
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function is used for computing the element-wise division of two arrays. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01181">1181</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aa61cded4977bb2dc3720f7057cc2fb47_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aa61cded4977bb2dc3720f7057cc2fb47_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_aa61cded4977bb2dc3720f7057cc2fb47_cgraph" id="anamespacenz_1_1krnl_aa61cded4977bb2dc3720f7057cc2fb47_cgraph">
<area shape="rect" title="Kernel function to perform element&#45;wise division of two arrays." alt="" coords="5,47,190,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="238,5,423,48"/>
<area shape="poly" title=" " alt="" coords="189,44,222,40,222,45,190,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="238,72,423,115"/>
<area shape="poly" title=" " alt="" coords="190,71,222,75,222,80,189,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="471,39,656,81"/>
<area shape="poly" title=" " alt="" coords="423,77,455,73,456,78,424,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="471,105,656,148"/>
<area shape="poly" title=" " alt="" coords="424,104,456,109,455,114,423,109"/>
</map>
</div>

</div>
</div>
<a id="aee8ca471aa260bd1fca5b1797e229f9f" name="aee8ca471aa260bd1fca5b1797e229f9f"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aee8ca471aa260bd1fca5b1797e229f9f">&#9670;&#160;</a></span>ELUBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::ELUBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">1.0f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the ELU activation during backpropagation. </p>
<p>This function computes the gradient of the ELU activation function during backpropagation and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the input array elements (before activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the gradient of the next layer </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The alpha parameter used for negative values (default 1.0) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00388">388</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aee8ca471aa260bd1fca5b1797e229f9f_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aee8ca471aa260bd1fca5b1797e229f9f_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_aee8ca471aa260bd1fca5b1797e229f9f_cgraph" id="anamespacenz_1_1krnl_aee8ca471aa260bd1fca5b1797e229f9f_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the ELU activation during backpropagation." alt="" coords="5,47,162,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="210,5,395,48"/>
<area shape="poly" title=" " alt="" coords="162,45,194,41,195,46,163,51"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="210,72,395,115"/>
<area shape="poly" title=" " alt="" coords="163,69,195,74,194,79,162,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="443,39,628,81"/>
<area shape="poly" title=" " alt="" coords="395,77,427,73,428,78,396,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="443,105,628,148"/>
<area shape="poly" title=" " alt="" coords="396,104,428,109,427,114,395,109"/>
</map>
</div>

</div>
</div>
<a id="ae45dbebceb76ddf82fa5e6b9df882e62" name="ae45dbebceb76ddf82fa5e6b9df882e62"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ae45dbebceb76ddf82fa5e6b9df882e62">&#9670;&#160;</a></span>Expand()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Expand </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>total</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Expands the input array into the output array with a specified total size. </p>
<p>This kernel function takes an input array and expands it into an output array by repeating or padding elements to match the specified total size.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the expanded data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array containing the original data. </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input array. </td></tr>
    <tr><td class="paramname">total</td><td>The total number of elements in the output array after expansion.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01290">1290</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_ae45dbebceb76ddf82fa5e6b9df882e62_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_ae45dbebceb76ddf82fa5e6b9df882e62_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_ae45dbebceb76ddf82fa5e6b9df882e62_cgraph" id="anamespacenz_1_1krnl_ae45dbebceb76ddf82fa5e6b9df882e62_cgraph">
<area shape="rect" title="Expands the input array into the output array with a specified total size." alt="" coords="5,47,122,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="170,5,355,48"/>
<area shape="poly" title=" " alt="" coords="122,48,154,42,155,47,123,53"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="170,72,355,115"/>
<area shape="poly" title=" " alt="" coords="123,67,155,73,154,78,122,72"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="403,39,588,81"/>
<area shape="poly" title=" " alt="" coords="355,77,387,73,388,78,356,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="403,105,588,148"/>
<area shape="poly" title=" " alt="" coords="356,104,388,109,387,114,355,109"/>
</map>
</div>

</div>
</div>
<a id="a0e82aca250b46ac8ded8cae8936d7e38" name="a0e82aca250b46ac8ded8cae8936d7e38"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a0e82aca250b46ac8ded8cae8936d7e38">&#9670;&#160;</a></span>ExponentialLinearUnit()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::ExponentialLinearUnit </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">1.0f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Exponential Linear Unit (ELU) activation function on the GPU. </p>
<p>This function applies the ELU activation function (x if x &gt; 0, alpha * (exp(x) - 1) if x &lt;= 0) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the ELU result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The alpha parameter used for negative values (default 1.0) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00372">372</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a0e82aca250b46ac8ded8cae8936d7e38_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a0e82aca250b46ac8ded8cae8936d7e38_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a0e82aca250b46ac8ded8cae8936d7e38_cgraph" id="anamespacenz_1_1krnl_a0e82aca250b46ac8ded8cae8936d7e38_cgraph">
<area shape="rect" title="Kernel function to apply the Exponential Linear Unit (ELU) activation function on the GPU." alt="" coords="5,47,205,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="253,5,438,48"/>
<area shape="poly" title=" " alt="" coords="205,44,237,39,237,44,205,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="253,72,438,115"/>
<area shape="poly" title=" " alt="" coords="205,71,237,76,237,81,205,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="486,39,671,81"/>
<area shape="poly" title=" " alt="" coords="438,77,470,73,471,78,439,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="486,105,671,148"/>
<area shape="poly" title=" " alt="" coords="439,104,471,109,470,114,438,109"/>
</map>
</div>

</div>
</div>
<a id="ad136c8a6560a5305984ce0a31bea71bf" name="ad136c8a6560a5305984ce0a31bea71bf"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ad136c8a6560a5305984ce0a31bea71bf">&#9670;&#160;</a></span>Fill()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Fill </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>value</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to fill a data array with a given value. </p>
<p>This function fills a data array with a specified value.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">data</td><td>Pointer to the data array that will be filled </td></tr>
    <tr><td class="paramname">value</td><td>The value to fill the array with </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the data array </td></tr>
    <tr><td class="paramname">offset</td><td>Offset applied to the data array (default 0) </td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function is used for initializing the data array with a given value. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01153">1153</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_ad136c8a6560a5305984ce0a31bea71bf_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_ad136c8a6560a5305984ce0a31bea71bf_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_ad136c8a6560a5305984ce0a31bea71bf_cgraph" id="anamespacenz_1_1krnl_ad136c8a6560a5305984ce0a31bea71bf_cgraph">
<area shape="rect" title="Kernel function to fill a data array with a given value." alt="" coords="5,47,95,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="143,5,328,48"/>
<area shape="poly" title=" " alt="" coords="95,49,127,44,128,49,96,55"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="143,72,328,115"/>
<area shape="poly" title=" " alt="" coords="96,65,128,71,127,76,95,71"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="376,39,561,81"/>
<area shape="poly" title=" " alt="" coords="328,77,360,73,361,78,329,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="376,105,561,148"/>
<area shape="poly" title=" " alt="" coords="329,104,361,109,360,114,328,109"/>
</map>
</div>

</div>
</div>
<a id="aa3720ebf4ae0cc9f4abbd1e32842191b" name="aa3720ebf4ae0cc9f4abbd1e32842191b"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aa3720ebf4ae0cc9f4abbd1e32842191b">&#9670;&#160;</a></span>GeneralMatrixMul() <span class="overload">[1/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::GeneralMatrixMul </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>M</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>N</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>K</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_c</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_a</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_b</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform single-precision matrix multiplication on GPU using CUDA cores. </p>
<p>This function is designed to execute general matrix multiplication using CUDA technology, leveraging the parallel computing capabilities of the GPU for efficient processing of large datasets. It performs single-precision (FP32) matrix multiplication on the CUDA cores, taking two input arrays of floats and storing their product in a third array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the first input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">B</td><td>Pointer to the second input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">C</td><td>Pointer to the output matrix where the result will be stored, allocated by the caller </td></tr>
    <tr><td class="paramname">M</td><td>The number of rows in matrix A and matrix C </td></tr>
    <tr><td class="paramname">N</td><td>The number of columns in matrix B and matrix C </td></tr>
    <tr><td class="paramname">K</td><td>The number of columns in matrix A and rows in matrix B </td></tr>
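    <tr><td class="paramname">offset_c</td><td>A vector of offsets for the output matrix C </td></tr>
    <tr><td class="paramname">offset_a</td><td>A vector of offsets for the first input matrix A </td></tr>
    <tr><td class="paramname">offset_b</td><td>A vector of offsets for the second input matrix B </td></tr>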
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00114">114</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aa3720ebf4ae0cc9f4abbd1e32842191b_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aa3720ebf4ae0cc9f4abbd1e32842191b_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_aa3720ebf4ae0cc9f4abbd1e32842191b_cgraph" id="anamespacenz_1_1krnl_aa3720ebf4ae0cc9f4abbd1e32842191b_cgraph">
<area shape="rect" title="Kernel function to perform single&#45;precision matrix multiplication on GPU using CUDA cores." alt="" coords="5,13,180,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="228,5,413,48"/>
<area shape="poly" title=" " alt="" coords="180,24,212,24,212,29,180,29"/>
</map>
</div>

</div>
</div>
<a id="ae30a6e1de69588aa0c6eb8a5b8e6e826" name="ae30a6e1de69588aa0c6eb8a5b8e6e826"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ae30a6e1de69588aa0c6eb8a5b8e6e826">&#9670;&#160;</a></span>GeneralMatrixMul() <span class="overload">[2/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::GeneralMatrixMul </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>M</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>N</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>K</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_c</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_a</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_b</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform single-precision matrix multiplication on GPU using CUDA cores. </p>
<p>This function is designed to execute general matrix multiplication using CUDA technology, leveraging the parallel computing capabilities of the GPU for efficient processing of large datasets. It performs single-precision (FP32) matrix multiplication on the CUDA cores, taking two input arrays of floats and storing their product in a third array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the first input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">B</td><td>Pointer to the second input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">C</td><td>Pointer to the output matrix where the result will be stored, allocated by the caller </td></tr>
    <tr><td class="paramname">M</td><td>The number of rows in matrix A and matrix C </td></tr>
    <tr><td class="paramname">N</td><td>The number of columns in matrix B and matrix C </td></tr>
    <tr><td class="paramname">K</td><td>The number of columns in matrix A and rows in matrix B </td></tr>
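    <tr><td class="paramname">offset_c</td><td>The offset for the output matrix C (default 0) </td></tr>
    <tr><td class="paramname">offset_a</td><td>The offset for the first input matrix A (default 0) </td></tr>
    <tr><td class="paramname">offset_b</td><td>The offset for the second input matrix B (default 0) </td></tr>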
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00103">103</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_ae30a6e1de69588aa0c6eb8a5b8e6e826_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_ae30a6e1de69588aa0c6eb8a5b8e6e826_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_ae30a6e1de69588aa0c6eb8a5b8e6e826_cgraph" id="anamespacenz_1_1krnl_ae30a6e1de69588aa0c6eb8a5b8e6e826_cgraph">
<area shape="rect" title="Kernel function to perform single&#45;precision matrix multiplication on GPU using CUDA cores." alt="" coords="5,47,180,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="228,5,413,48"/>
<area shape="poly" title=" " alt="" coords="180,45,212,40,213,45,181,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="228,72,413,115"/>
<area shape="poly" title=" " alt="" coords="181,70,213,75,212,80,180,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="461,39,646,81"/>
<area shape="poly" title=" " alt="" coords="413,77,445,73,446,78,414,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="461,105,646,148"/>
<area shape="poly" title=" " alt="" coords="414,104,446,109,445,114,413,109"/>
</map>
</div>

</div>
</div>
<a id="a73ceb77688c4008dc350fc87b99875aa" name="a73ceb77688c4008dc350fc87b99875aa"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a73ceb77688c4008dc350fc87b99875aa">&#9670;&#160;</a></span>GlobalAvgPoolBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::GlobalAvgPoolBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>output</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batches</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>channels</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>height</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>width</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of global average pooling during backpropagation. </p>
<p>This function computes the gradient of the global average pooling operation, distributing the gradient values evenly across all spatial dimensions.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">output</td><td>Pointer to the output array where the gradient will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array containing the gradient from the next layer. </td></tr>
    <tr><td class="paramname">batches</td><td>The number of batches in the input tensor. </td></tr>
    <tr><td class="paramname">channels</td><td>The number of channels in the input tensor. </td></tr>
    <tr><td class="paramname">height</td><td>The height of the input tensor. </td></tr>
    <tr><td class="paramname">width</td><td>The width of the input tensor. </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01502">1502</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a73ceb77688c4008dc350fc87b99875aa_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a73ceb77688c4008dc350fc87b99875aa_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a73ceb77688c4008dc350fc87b99875aa_cgraph" id="anamespacenz_1_1krnl_a73ceb77688c4008dc350fc87b99875aa_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of global average pooling during backpropagation." alt="" coords="5,47,222,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="270,5,455,48"/>
<area shape="poly" title=" " alt="" coords="218,43,254,38,255,44,219,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="270,72,455,115"/>
<area shape="poly" title=" " alt="" coords="219,71,255,76,254,82,218,77"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="503,39,688,81"/>
<area shape="poly" title=" " alt="" coords="455,77,487,73,488,78,456,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="503,105,688,148"/>
<area shape="poly" title=" " alt="" coords="456,104,488,109,487,114,455,109"/>
</map>
</div>

</div>
</div>
<a id="a0ed44a68bfb86a9fd3d6c3b25614713f" name="a0ed44a68bfb86a9fd3d6c3b25614713f"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a0ed44a68bfb86a9fd3d6c3b25614713f">&#9670;&#160;</a></span>gradCopy()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::gradCopy </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_o</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_i</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Copies gradient data from one array to another with specified offsets. </p>
<p>This kernel function performs a gradient copy operation, transferring data from the input array to the output array while applying offsets for both the input and output arrays.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the gradient data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array containing the gradient data to be copied. </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements to copy. </td></tr>
    <tr><td class="paramname">offset_o</td><td>A vector of offsets for the output array. </td></tr>
    <tr><td class="paramname">offset_i</td><td>A vector of offsets for the input array.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function is designed for use in GPU-based gradient operations and assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01238">1238</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a0ed44a68bfb86a9fd3d6c3b25614713f_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a0ed44a68bfb86a9fd3d6c3b25614713f_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a0ed44a68bfb86a9fd3d6c3b25614713f_cgraph" id="anamespacenz_1_1krnl_a0ed44a68bfb86a9fd3d6c3b25614713f_cgraph">
<area shape="rect" title="Copies gradient data from one array to another with specified offsets." alt="" coords="5,113,134,140"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="182,5,367,48"/>
<area shape="poly" title=" " alt="" coords="90,111,131,85,180,58,193,52,195,57,183,62,133,89,92,115"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="182,72,367,115"/>
<area shape="poly" title=" " alt="" coords="134,114,166,108,167,114,135,119"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1260d95d0eddf75b72700da07361a4bd" title="Records write completion event for asynchronous data operations." alt="" coords="182,139,367,181"/>
<area shape="poly" title=" " alt="" coords="135,134,167,140,166,145,134,140"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="182,205,367,248"/>
<area shape="poly" title=" " alt="" coords="92,138,133,164,183,191,195,197,193,201,180,196,131,169,90,143"/>
</map>
</div>

</div>
</div>
<a id="a8ec4524fdefd3d771c72e77e94281c88" name="a8ec4524fdefd3d771c72e77e94281c88"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a8ec4524fdefd3d771c72e77e94281c88">&#9670;&#160;</a></span>HadamardProduct()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::HadamardProduct </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in1</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in2</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform element-wise Hadamard product of two arrays. </p>
<p>This function performs the element-wise Hadamard product of two input arrays and stores the result in an output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array </td></tr>
    <tr><td class="paramname">in1</td><td>Pointer to the first input array </td></tr>
    <tr><td class="paramname">in2</td><td>Pointer to the second input array </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function is used for computing the element-wise Hadamard product of two arrays. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01165">1165</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a8ec4524fdefd3d771c72e77e94281c88_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a8ec4524fdefd3d771c72e77e94281c88_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a8ec4524fdefd3d771c72e77e94281c88_cgraph" id="anamespacenz_1_1krnl_a8ec4524fdefd3d771c72e77e94281c88_cgraph">
<area shape="rect" title="Kernel function to perform element&#45;wise Hadamard product of two arrays." alt="" coords="5,47,183,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="231,5,416,48"/>
<area shape="poly" title=" " alt="" coords="183,44,215,40,215,45,183,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="231,72,416,115"/>
<area shape="poly" title=" " alt="" coords="183,70,215,75,215,80,183,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="464,39,649,81"/>
<area shape="poly" title=" " alt="" coords="416,77,448,73,449,78,417,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="464,105,649,148"/>
<area shape="poly" title=" " alt="" coords="417,104,449,109,448,114,416,109"/>
</map>
</div>

</div>
</div>
<a id="a52e449285e560185378234aecaf2f87c" name="a52e449285e560185378234aecaf2f87c"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a52e449285e560185378234aecaf2f87c">&#9670;&#160;</a></span>HardSigmoid()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::HardSigmoid </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.2f</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.5f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Hard Sigmoid activation function on the GPU. </p>
<p>This function applies the Hard Sigmoid activation function (min(max(alpha * x + beta, 0), 1)) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Hard Sigmoid result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The slope of the Hard Sigmoid (default 0.2) </td></tr>
    <tr><td class="paramname">beta</td><td>The offset of the Hard Sigmoid (default 0.5) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00403">403</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a52e449285e560185378234aecaf2f87c_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a52e449285e560185378234aecaf2f87c_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a52e449285e560185378234aecaf2f87c_cgraph" id="anamespacenz_1_1krnl_a52e449285e560185378234aecaf2f87c_cgraph">
<area shape="rect" title="Kernel function to apply the Hard Sigmoid activation function on the GPU." alt="" coords="5,47,153,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="201,5,386,48"/>
<area shape="poly" title=" " alt="" coords="153,46,185,41,186,46,154,51"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="201,72,386,115"/>
<area shape="poly" title=" " alt="" coords="154,69,186,74,185,79,153,74"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="434,39,619,81"/>
<area shape="poly" title=" " alt="" coords="386,77,418,73,419,78,387,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="434,105,619,148"/>
<area shape="poly" title=" " alt="" coords="387,104,419,109,418,114,386,109"/>
</map>
</div>

</div>
</div>
<a id="a43232f9472ad3b974351e59386208efa" name="a43232f9472ad3b974351e59386208efa"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a43232f9472ad3b974351e59386208efa">&#9670;&#160;</a></span>HardSigmoidBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::HardSigmoidBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.2f</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.5f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the Hard Sigmoid activation during backpropagation. </p>
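<p>This function computes the gradient of the Hard Sigmoid activation function during backpropagation, using the pre-activation input <code>A</code> and the incoming gradient <code>B_grad</code>, and stores the result in the output array <code>A_grad</code>.</p>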
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the input array elements (before activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the input array holding the gradient propagated back from the next layer </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The slope of the Hard Sigmoid (default 0.2) </td></tr>
    <tr><td class="paramname">beta</td><td>The offset of the Hard Sigmoid (default 0.5) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00424">424</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a43232f9472ad3b974351e59386208efa_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a43232f9472ad3b974351e59386208efa_cgraph" alt="Call graph for nz::krnl::HardSigmoidBackward()"/></div>
<map name="anamespacenz_1_1krnl_a43232f9472ad3b974351e59386208efa_cgraph" id="anamespacenz_1_1krnl_a43232f9472ad3b974351e59386208efa_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the Hard Sigmoid activation during backpropagation." alt="" coords="5,47,211,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="259,5,444,48"/>
<area shape="poly" title=" " alt="" coords="210,43,243,39,243,44,211,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="259,72,444,115"/>
<area shape="poly" title=" " alt="" coords="211,71,243,76,243,81,210,77"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="492,39,677,81"/>
<area shape="poly" title=" " alt="" coords="444,77,476,73,477,78,445,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="492,105,677,148"/>
<area shape="poly" title=" " alt="" coords="445,104,477,109,476,114,444,109"/>
</map>
</div>

</div>
</div>
<a id="aef9c028ed356b5684e103639bb23bcf0" name="aef9c028ed356b5684e103639bb23bcf0"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aef9c028ed356b5684e103639bb23bcf0">&#9670;&#160;</a></span>HardSwish()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::HardSwish </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.2f</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.5f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Hard Swish activation function on the GPU. </p>
<p>This function applies the Hard Swish activation function (x * HardSigmoid(x)) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Hard Swish result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The slope of the Hard Sigmoid (default 0.2) </td></tr>
    <tr><td class="paramname">beta</td><td>The offset of the Hard Sigmoid (default 0.5) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00445">445</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aef9c028ed356b5684e103639bb23bcf0_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aef9c028ed356b5684e103639bb23bcf0_cgraph" alt="Call graph for nz::krnl::HardSwish()"/></div>
<map name="anamespacenz_1_1krnl_aef9c028ed356b5684e103639bb23bcf0_cgraph" id="anamespacenz_1_1krnl_aef9c028ed356b5684e103639bb23bcf0_cgraph">
<area shape="rect" title="Kernel function to apply the Hard Swish activation function on the GPU." alt="" coords="5,47,142,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="190,5,375,48"/>
<area shape="poly" title=" " alt="" coords="141,47,174,41,175,47,142,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="190,72,375,115"/>
<area shape="poly" title=" " alt="" coords="142,68,175,73,174,79,141,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="423,39,608,81"/>
<area shape="poly" title=" " alt="" coords="375,77,407,73,408,78,376,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="423,105,608,148"/>
<area shape="poly" title=" " alt="" coords="376,104,408,109,407,114,375,109"/>
</map>
</div>

</div>
</div>
<a id="a455365870d43ff26687a731d15c4cdff" name="a455365870d43ff26687a731d15c4cdff"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a455365870d43ff26687a731d15c4cdff">&#9670;&#160;</a></span>HardSwishBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::HardSwishBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.2f</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.5f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the Hard Swish activation during backpropagation. </p>
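<p>This function computes the gradient of the Hard Swish activation function during backpropagation, using the pre-activation input <code>A</code> and the incoming gradient <code>B_grad</code>, and stores the result in the output array <code>A_grad</code>.</p>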
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the input array elements (before activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the input array holding the gradient propagated back from the next layer </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The slope of the Hard Sigmoid (default 0.2) </td></tr>
    <tr><td class="paramname">beta</td><td>The offset of the Hard Sigmoid (default 0.5) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00462">462</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a455365870d43ff26687a731d15c4cdff_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a455365870d43ff26687a731d15c4cdff_cgraph" alt="Call graph for nz::krnl::HardSwishBackward()"/></div>
<map name="anamespacenz_1_1krnl_a455365870d43ff26687a731d15c4cdff_cgraph" id="anamespacenz_1_1krnl_a455365870d43ff26687a731d15c4cdff_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the Hard Swish activation during backpropagation." alt="" coords="5,47,200,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="248,5,433,48"/>
<area shape="poly" title=" " alt="" coords="200,44,232,39,233,44,201,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="248,72,433,115"/>
<area shape="poly" title=" " alt="" coords="201,71,233,76,232,81,200,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="481,39,666,81"/>
<area shape="poly" title=" " alt="" coords="433,77,465,73,466,78,434,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="481,105,666,148"/>
<area shape="poly" title=" " alt="" coords="434,104,466,109,465,114,433,109"/>
</map>
</div>

</div>
</div>
<a id="a3a781324400c54c35dd564f3599dca8e" name="a3a781324400c54c35dd564f3599dca8e"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a3a781324400c54c35dd564f3599dca8e">&#9670;&#160;</a></span>img2col()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::img2col </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>K_h</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>K_w</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>pad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batch</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Rearranges image data into column format for convolution operations. </p>
<p>This kernel function transforms the input image data into a columnar format (im2col) to facilitate efficient convolution operations. It extracts patches from the input image based on the kernel size, stride, and padding, and stores them in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the columnar data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input image data array. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output feature map. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output feature map. </td></tr>
    <tr><td class="paramname">C</td><td>The number of input channels. </td></tr>
    <tr><td class="paramname">K_h</td><td>The height of the convolution kernel. </td></tr>
    <tr><td class="paramname">K_w</td><td>The width of the convolution kernel. </td></tr>
    <tr><td class="paramname">stride</td><td>The stride of the convolution operation. </td></tr>
    <tr><td class="paramname">pad</td><td>The padding applied to the input image. </td></tr>
    <tr><td class="paramname">H_in</td><td>The height of the input image. </td></tr>
    <tr><td class="paramname">W_in</td><td>The width of the input image. </td></tr>
    <tr><td class="paramname">batch</td><td>The number of images in the batch.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01330">1330</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a3a781324400c54c35dd564f3599dca8e_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a3a781324400c54c35dd564f3599dca8e_cgraph" alt="Call graph for nz::krnl::img2col()"/></div>
<map name="anamespacenz_1_1krnl_a3a781324400c54c35dd564f3599dca8e_cgraph" id="anamespacenz_1_1krnl_a3a781324400c54c35dd564f3599dca8e_cgraph">
<area shape="rect" title="Rearranges image data into column format for convolution operations." alt="" coords="5,47,123,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="171,5,356,48"/>
<area shape="poly" title=" " alt="" coords="123,48,155,42,156,47,124,53"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="171,72,356,115"/>
<area shape="poly" title=" " alt="" coords="124,67,156,73,155,78,123,72"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="404,39,589,81"/>
<area shape="poly" title=" " alt="" coords="356,77,388,73,389,78,357,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="404,105,589,148"/>
<area shape="poly" title=" " alt="" coords="357,104,389,109,388,114,356,109"/>
</map>
</div>

</div>
</div>
<a id="a1c2b7a6f28d2af22f9a2623c5ae62bff" name="a1c2b7a6f28d2af22f9a2623c5ae62bff"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a1c2b7a6f28d2af22f9a2623c5ae62bff">&#9670;&#160;</a></span>img2colBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::img2colBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>K_h</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>K_w</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>pad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batch</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Rearranges columnar data back into image format for backpropagation in convolution operations. </p>
<p>This kernel function performs the reverse operation of <code>img2col</code>, transforming columnar data back into its original image format. It is used during the backpropagation phase of convolutional neural networks to reconstruct the gradient of the input image.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the reconstructed image data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input columnar data array. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output feature map. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output feature map. </td></tr>
    <tr><td class="paramname">C</td><td>The number of input channels. </td></tr>
    <tr><td class="paramname">K_h</td><td>The height of the convolution kernel. </td></tr>
    <tr><td class="paramname">K_w</td><td>The width of the convolution kernel. </td></tr>
    <tr><td class="paramname">stride</td><td>The stride of the convolution operation. </td></tr>
    <tr><td class="paramname">pad</td><td>The padding applied to the input image. </td></tr>
    <tr><td class="paramname">H_in</td><td>The height of the input image. </td></tr>
    <tr><td class="paramname">W_in</td><td>The width of the input image. </td></tr>
    <tr><td class="paramname">batch</td><td>The number of images in the batch.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01357">1357</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a1c2b7a6f28d2af22f9a2623c5ae62bff_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a1c2b7a6f28d2af22f9a2623c5ae62bff_cgraph" alt="Call graph for nz::krnl::img2colBackward()"/></div>
<map name="anamespacenz_1_1krnl_a1c2b7a6f28d2af22f9a2623c5ae62bff_cgraph" id="anamespacenz_1_1krnl_a1c2b7a6f28d2af22f9a2623c5ae62bff_cgraph">
<area shape="rect" title="Rearranges columnar data back into image format for backpropagation in convolution operations." alt="" coords="5,47,181,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="229,5,414,48"/>
<area shape="poly" title=" " alt="" coords="181,45,213,40,213,45,181,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="229,72,414,115"/>
<area shape="poly" title=" " alt="" coords="181,70,213,75,213,80,181,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="462,39,647,81"/>
<area shape="poly" title=" " alt="" coords="414,77,446,73,447,78,415,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="462,105,647,148"/>
<area shape="poly" title=" " alt="" coords="415,104,447,109,446,114,414,109"/>
</map>
</div>

</div>
</div>
<a id="a04246c5218530f789a0ed4811b7ef3f3" name="a04246c5218530f789a0ed4811b7ef3f3"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a04246c5218530f789a0ed4811b7ef3f3">&#9670;&#160;</a></span>LeakyReLU()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::LeakyReLU </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.01f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Leaky ReLU activation function on the GPU. </p>
<p>This function applies the Leaky ReLU activation function (max(alpha * x, x)) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Leaky ReLU result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The slope of the negative part of the Leaky ReLU (default 0.01) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00315">315</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a04246c5218530f789a0ed4811b7ef3f3_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a04246c5218530f789a0ed4811b7ef3f3_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a04246c5218530f789a0ed4811b7ef3f3_cgraph" id="anamespacenz_1_1krnl_a04246c5218530f789a0ed4811b7ef3f3_cgraph">
<area shape="rect" title="Kernel function to apply the Leaky ReLU activation function on the GPU." alt="" coords="5,47,147,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="195,5,380,48"/>
<area shape="poly" title=" " alt="" coords="146,46,179,41,180,46,147,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="195,72,380,115"/>
<area shape="poly" title=" " alt="" coords="147,68,180,74,179,79,146,74"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="428,39,613,81"/>
<area shape="poly" title=" " alt="" coords="380,77,412,73,413,78,381,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="428,105,613,148"/>
<area shape="poly" title=" " alt="" coords="381,104,413,109,412,114,380,109"/>
</map>
</div>

</div>
</div>
<a id="a7eade95ddcf48141d69bb19803b22d51" name="a7eade95ddcf48141d69bb19803b22d51"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a7eade95ddcf48141d69bb19803b22d51">&#9670;&#160;</a></span>LeakyReLUBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::LeakyReLUBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>alpha</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0.01f</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the Leaky ReLU activation during backpropagation. </p>
<p>This function computes the gradient of the Leaky ReLU activation function during backpropagation (dL/dx = dL/dy * (x &gt; 0 ? 1 : alpha)) and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the input array elements (before activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the gradient of the next layer </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
    <tr><td class="paramname">alpha</td><td>The slope of the negative part of the Leaky ReLU (default 0.01) </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00330">330</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a7eade95ddcf48141d69bb19803b22d51_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a7eade95ddcf48141d69bb19803b22d51_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a7eade95ddcf48141d69bb19803b22d51_cgraph" id="anamespacenz_1_1krnl_a7eade95ddcf48141d69bb19803b22d51_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the Leaky ReLU activation during backpropagation." alt="" coords="5,47,205,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="253,5,438,48"/>
<area shape="poly" title=" " alt="" coords="205,44,237,39,237,44,205,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="253,72,438,115"/>
<area shape="poly" title=" " alt="" coords="205,71,237,76,237,81,205,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="486,39,671,81"/>
<area shape="poly" title=" " alt="" coords="438,77,470,73,471,78,439,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="486,105,671,148"/>
<area shape="poly" title=" " alt="" coords="439,104,471,109,470,114,438,109"/>
</map>
</div>

</div>
</div>
<a id="a5b29c405a1df9534430ad8682960ebb5" name="a5b29c405a1df9534430ad8682960ebb5"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a5b29c405a1df9534430ad8682960ebb5">&#9670;&#160;</a></span>MatrixAdd() <span class="overload">[1/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MatrixAdd </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>a</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>b</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>c</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_c</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_a</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_b</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform matrix addition on GPU. </p>
<p>This function is designed to execute matrix addition using CUDA technology, leveraging parallel computing capabilities of the GPU for efficient processing of large datasets. It takes two input arrays of floats and stores their sum in a third array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">a</td><td>Pointer to the first input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">b</td><td>Pointer to the second input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">c</td><td>Pointer to the output matrix where the result will be stored, allocated by the caller </td></tr>
    <tr><td class="paramname">n</td><td>The size of the matrix, representing the number of elements along one dimension (for a square matrix, total elements are n*n) </td></tr>
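    <tr><td class="paramname">offset_c</td><td>Vector of offsets into the output array <code>c</code> </td></tr>
    <tr><td class="paramname">offset_a</td><td>Vector of offsets into the first input array <code>a</code> </td></tr>
    <tr><td class="paramname">offset_b</td><td>Vector of offsets into the second input array <code>b</code> </td></tr>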
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00032">32</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a5b29c405a1df9534430ad8682960ebb5_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a5b29c405a1df9534430ad8682960ebb5_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a5b29c405a1df9534430ad8682960ebb5_cgraph" id="anamespacenz_1_1krnl_a5b29c405a1df9534430ad8682960ebb5_cgraph">
<area shape="rect" title="Kernel function to perform matrix addition on GPU." alt="" coords="5,13,137,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="185,5,370,48"/>
<area shape="poly" title=" " alt="" coords="137,24,169,24,169,29,137,29"/>
</map>
</div>

</div>
</div>
<a id="a97cda6dfc6545efaee2b686eed9ae766" name="a97cda6dfc6545efaee2b686eed9ae766"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a97cda6dfc6545efaee2b686eed9ae766">&#9670;&#160;</a></span>MatrixAdd() <span class="overload">[2/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MatrixAdd </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>a</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>b</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>c</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_c</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_a</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_b</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform matrix addition on GPU. </p>
<p>This function is designed to execute matrix addition using CUDA technology, leveraging parallel computing capabilities of the GPU for efficient processing of large datasets. It takes two input arrays of floats and stores their sum in a third array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">a</td><td>Pointer to the first input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">b</td><td>Pointer to the second input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">c</td><td>Pointer to the output matrix where the result will be stored, allocated by the caller </td></tr>
    <tr><td class="paramname">n</td><td>The size of the matrix, representing the number of elements along one dimension (for a square matrix, total elements are n*n) </td></tr>
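    <tr><td class="paramname">offset_c</td><td>Offset into the output array <code>c</code> (default 0) </td></tr>
    <tr><td class="paramname">offset_a</td><td>Offset into the first input array <code>a</code> (default 0) </td></tr>
    <tr><td class="paramname">offset_b</td><td>Offset into the second input array <code>b</code> (default 0) </td></tr>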
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00026">26</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a97cda6dfc6545efaee2b686eed9ae766_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a97cda6dfc6545efaee2b686eed9ae766_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a97cda6dfc6545efaee2b686eed9ae766_cgraph" id="anamespacenz_1_1krnl_a97cda6dfc6545efaee2b686eed9ae766_cgraph">
<area shape="rect" title="Kernel function to perform matrix addition on GPU." alt="" coords="5,47,137,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="185,5,370,48"/>
<area shape="poly" title=" " alt="" coords="136,47,169,42,169,47,137,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="185,72,370,115"/>
<area shape="poly" title=" " alt="" coords="137,68,169,73,169,78,136,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="418,39,603,81"/>
<area shape="poly" title=" " alt="" coords="370,77,402,73,403,78,371,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="418,105,603,148"/>
<area shape="poly" title=" " alt="" coords="371,104,403,109,402,114,370,109"/>
</map>
</div>

</div>
</div>
<a id="a4ca041c74dc55e3ac9124b5fd39b985c" name="a4ca041c74dc55e3ac9124b5fd39b985c"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a4ca041c74dc55e3ac9124b5fd39b985c">&#9670;&#160;</a></span>MatrixSub() <span class="overload">[1/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MatrixSub </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>a</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>b</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>c</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_c</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_a</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_b</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform matrix subtraction on GPU. </p>
<p>This function is designed to execute matrix subtraction using CUDA technology, leveraging parallel computing capabilities of the GPU for efficient processing of large datasets. It takes two input arrays of floats and stores their difference in a third array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
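    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>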
    <tr><td class="paramname">a</td><td>Pointer to the first input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">b</td><td>Pointer to the second input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">c</td><td>Pointer to the output matrix where the result will be stored, allocated by the caller </td></tr>
    <tr><td class="paramname">n</td><td>The size of the matrix, representing the number of elements along one dimension (for a square matrix, total elements are n*n) </td></tr>
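    <tr><td class="paramname">offset_c</td><td>Vector of offsets into the output array <code>c</code> </td></tr>
    <tr><td class="paramname">offset_a</td><td>Vector of offsets into the first input array <code>a</code> </td></tr>
    <tr><td class="paramname">offset_b</td><td>Vector of offsets into the second input array <code>b</code> </td></tr>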
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00058">58</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a4ca041c74dc55e3ac9124b5fd39b985c_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a4ca041c74dc55e3ac9124b5fd39b985c_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a4ca041c74dc55e3ac9124b5fd39b985c_cgraph" id="anamespacenz_1_1krnl_a4ca041c74dc55e3ac9124b5fd39b985c_cgraph">
<area shape="rect" title="Kernel function to perform matrix subtraction on GPU." alt="" coords="5,13,137,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="185,5,370,48"/>
<area shape="poly" title=" " alt="" coords="137,24,169,24,169,29,137,29"/>
</map>
</div>

</div>
</div>
<a id="ad18a2b0efc0cdfc9cb861396ad4da53f" name="ad18a2b0efc0cdfc9cb861396ad4da53f"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ad18a2b0efc0cdfc9cb861396ad4da53f">&#9670;&#160;</a></span>MatrixSub() <span class="overload">[2/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MatrixSub </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>a</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>b</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>c</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_c</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_a</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset_b</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform matrix subtraction on GPU. </p>
<p>This function is designed to execute matrix subtraction using CUDA technology, leveraging parallel computing capabilities of the GPU for efficient processing of large datasets. It takes two input arrays of floats and stores their difference in a third array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
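    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>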
    <tr><td class="paramname">a</td><td>Pointer to the first input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">b</td><td>Pointer to the second input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">c</td><td>Pointer to the output matrix where the result will be stored, allocated by the caller </td></tr>
    <tr><td class="paramname">n</td><td>The size of the matrix, representing the number of elements along one dimension (for a square matrix, total elements are n*n) </td></tr>
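    <tr><td class="paramname">offset_c</td><td>Offset into the output array <code>c</code> (default 0) </td></tr>
    <tr><td class="paramname">offset_a</td><td>Offset into the first input array <code>a</code> (default 0) </td></tr>
    <tr><td class="paramname">offset_b</td><td>Offset into the second input array <code>b</code> (default 0) </td></tr>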
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00050">50</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_ad18a2b0efc0cdfc9cb861396ad4da53f_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_ad18a2b0efc0cdfc9cb861396ad4da53f_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_ad18a2b0efc0cdfc9cb861396ad4da53f_cgraph" id="anamespacenz_1_1krnl_ad18a2b0efc0cdfc9cb861396ad4da53f_cgraph">
<area shape="rect" title="Kernel function to perform matrix subtraction on GPU." alt="" coords="5,47,137,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="185,5,370,48"/>
<area shape="poly" title=" " alt="" coords="136,47,169,42,169,47,137,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="185,72,370,115"/>
<area shape="poly" title=" " alt="" coords="137,68,169,73,169,78,136,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="418,39,603,81"/>
<area shape="poly" title=" " alt="" coords="370,77,402,73,403,78,371,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="418,105,603,148"/>
<area shape="poly" title=" " alt="" coords="371,104,403,109,402,114,370,109"/>
</map>
</div>

</div>
</div>
<a id="abcc632e5a7104c1a28208e94a4ce6e28" name="abcc632e5a7104c1a28208e94a4ce6e28"></a>
<h2 class="memtitle"><span class="permalink"><a href="#abcc632e5a7104c1a28208e94a4ce6e28">&#9670;&#160;</a></span>MaxPooling()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MaxPooling </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>output</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>position</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>input</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>pool_size</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>padding</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batches</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>channels</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform max pooling on the GPU. </p>
<p>This function applies max pooling to the input tensor, reducing its spatial dimensions by selecting the maximum value within each pooling window.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">output</td><td>Pointer to the output array where the pooled results will be stored. </td></tr>
    <tr><td class="paramname">position</td><td>Pointer to the array where the positions of the maximum values will be stored. </td></tr>
    <tr><td class="paramname">input</td><td>Pointer to the input array containing the original data. </td></tr>
    <tr><td class="paramname">pool_size</td><td>The size of the pooling window. </td></tr>
    <tr><td class="paramname">stride</td><td>The stride of the pooling operation. </td></tr>
    <tr><td class="paramname">padding</td><td>The padding applied to the input tensor. </td></tr>
    <tr><td class="paramname">batches</td><td>The number of batches in the input tensor. </td></tr>
    <tr><td class="paramname">channels</td><td>The number of channels in the input tensor. </td></tr>
    <tr><td class="paramname">H_in</td><td>The height of the input tensor. </td></tr>
    <tr><td class="paramname">W_in</td><td>The width of the input tensor. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output tensor. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output tensor. </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01539">1539</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_abcc632e5a7104c1a28208e94a4ce6e28_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_abcc632e5a7104c1a28208e94a4ce6e28_cgraph" alt="Call graph for the max pooling kernel function"/></div>
<map name="anamespacenz_1_1krnl_abcc632e5a7104c1a28208e94a4ce6e28_cgraph" id="anamespacenz_1_1krnl_abcc632e5a7104c1a28208e94a4ce6e28_cgraph">
<area shape="rect" title="Kernel function to perform max pooling on the GPU." alt="" coords="5,13,146,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="194,5,379,48"/>
<area shape="poly" title=" " alt="" coords="146,24,178,24,178,29,146,29"/>
</map>
</div>

</div>
</div>
<a id="a0d5f5f4c9e89a8d914a7f2f802d1caab" name="a0d5f5f4c9e89a8d914a7f2f802d1caab"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a0d5f5f4c9e89a8d914a7f2f802d1caab">&#9670;&#160;</a></span>MaxPoolingBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MaxPoolingBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>output</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>position</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>input</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>pool_size</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>stride</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>padding</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>batches</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>channels</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>H_out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>W_out</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of max pooling during backpropagation. </p>
<p>This function computes the gradient of the max pooling operation, propagating the gradient values only to the positions of the maximum values in the pooling window.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">output</td><td>Pointer to the output array where the gradient will be stored. </td></tr>
    <tr><td class="paramname">position</td><td>Pointer to the array containing the positions of the maximum values. </td></tr>
    <tr><td class="paramname">input</td><td>Pointer to the input array containing the gradient from the next layer. </td></tr>
    <tr><td class="paramname">pool_size</td><td>The size of the pooling window. </td></tr>
    <tr><td class="paramname">stride</td><td>The stride of the pooling operation. </td></tr>
    <tr><td class="paramname">padding</td><td>The padding applied to the input tensor. </td></tr>
    <tr><td class="paramname">batches</td><td>The number of batches in the input tensor. </td></tr>
    <tr><td class="paramname">channels</td><td>The number of channels in the input tensor. </td></tr>
    <tr><td class="paramname">H_in</td><td>The height of the input tensor. </td></tr>
    <tr><td class="paramname">W_in</td><td>The width of the input tensor. </td></tr>
    <tr><td class="paramname">H_out</td><td>The height of the output tensor. </td></tr>
    <tr><td class="paramname">W_out</td><td>The width of the output tensor. </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01567">1567</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a0d5f5f4c9e89a8d914a7f2f802d1caab_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a0d5f5f4c9e89a8d914a7f2f802d1caab_cgraph" alt="Call graph for nz::krnl::MaxPoolingBackward"/></div>
<map name="anamespacenz_1_1krnl_a0d5f5f4c9e89a8d914a7f2f802d1caab_cgraph" id="anamespacenz_1_1krnl_a0d5f5f4c9e89a8d914a7f2f802d1caab_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of max pooling during backpropagation." alt="" coords="5,47,204,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="252,5,437,48"/>
<area shape="poly" title=" " alt="" coords="204,44,236,39,237,44,204,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="252,72,437,115"/>
<area shape="poly" title=" " alt="" coords="204,71,237,76,236,81,204,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="485,39,670,81"/>
<area shape="poly" title=" " alt="" coords="437,77,469,73,470,78,438,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="485,105,670,148"/>
<area shape="poly" title=" " alt="" coords="438,104,470,109,469,114,437,109"/>
</map>
</div>

</div>
</div>
<a id="af76ce6a930db4def5ceb51350af72f3c" name="af76ce6a930db4def5ceb51350af72f3c"></a>
<h2 class="memtitle"><span class="permalink"><a href="#af76ce6a930db4def5ceb51350af72f3c">&#9670;&#160;</a></span>MeanSquaredError()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MeanSquaredError </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>sharedMemSize</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>predict</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>real</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the Mean Squared Error (MSE) loss between predicted and real values. </p>
<p>This function computes the Mean Squared Error loss between the predicted and real values for each element in the input arrays and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">sharedMemSize</td><td>The size of the shared memory buffer used by the kernel </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the MSE result will be stored </td></tr>
    <tr><td class="paramname">predict</td><td>Pointer to the predicted values </td></tr>
    <tr><td class="paramname">real</td><td>Pointer to the real values </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00615">615</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_af76ce6a930db4def5ceb51350af72f3c_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_af76ce6a930db4def5ceb51350af72f3c_cgraph" alt="Call graph for nz::krnl::MeanSquaredError"/></div>
<map name="anamespacenz_1_1krnl_af76ce6a930db4def5ceb51350af72f3c_cgraph" id="anamespacenz_1_1krnl_af76ce6a930db4def5ceb51350af72f3c_cgraph">
<area shape="rect" title="Kernel function to compute the Mean Squared Error (MSE) loss between predicted and real values." alt="" coords="5,47,186,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="234,5,419,48"/>
<area shape="poly" title=" " alt="" coords="185,44,218,40,219,45,186,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="234,72,419,115"/>
<area shape="poly" title=" " alt="" coords="186,70,219,75,218,80,185,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="467,39,652,81"/>
<area shape="poly" title=" " alt="" coords="419,77,451,73,452,78,420,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="467,105,652,148"/>
<area shape="poly" title=" " alt="" coords="420,104,452,109,451,114,419,109"/>
</map>
</div>

</div>
</div>
<a id="a273ef3023442a864f1028becaf236bae" name="a273ef3023442a864f1028becaf236bae"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a273ef3023442a864f1028becaf236bae">&#9670;&#160;</a></span>Momentum()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Momentum </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>output</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>velocity</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply Momentum optimization. </p>
<p>This function updates the output array using the Momentum optimization method, which incorporates the previous velocity to smooth the gradient update.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">output</td><td>Pointer to the output array that will be updated </td></tr>
    <tr><td class="paramname">grad</td><td>Pointer to the gradient array </td></tr>
    <tr><td class="paramname">velocity</td><td>Pointer to the previous velocity array </td></tr>
    <tr><td class="paramname">beta</td><td>The momentum factor (typically between 0.9 and 0.99) </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the output, gradient, and velocity arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00715">715</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a273ef3023442a864f1028becaf236bae_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a273ef3023442a864f1028becaf236bae_cgraph" alt="Call graph for nz::krnl::Momentum"/></div>
<map name="anamespacenz_1_1krnl_a273ef3023442a864f1028becaf236bae_cgraph" id="anamespacenz_1_1krnl_a273ef3023442a864f1028becaf236bae_cgraph">
<area shape="rect" title="Kernel function to apply Momentum optimization." alt="" coords="5,47,143,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="191,5,376,48"/>
<area shape="poly" title=" " alt="" coords="143,46,175,41,176,47,143,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="191,72,376,115"/>
<area shape="poly" title=" " alt="" coords="143,68,176,73,175,79,143,74"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="424,39,609,81"/>
<area shape="poly" title=" " alt="" coords="376,77,408,73,409,78,377,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="424,105,609,148"/>
<area shape="poly" title=" " alt="" coords="377,104,409,109,408,114,376,109"/>
</map>
</div>

</div>
</div>
<a id="ae77920db6adf79a17dbfb1dbf1ab5656" name="ae77920db6adf79a17dbfb1dbf1ab5656"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ae77920db6adf79a17dbfb1dbf1ab5656">&#9670;&#160;</a></span>MSEBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::MSEBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>predict</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>real</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the Mean Squared Error (MSE) loss for backpropagation. </p>
<p>This function computes the gradient of the Mean Squared Error loss between the predicted and real values for each element in the input arrays and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the MSE gradient will be stored </td></tr>
    <tr><td class="paramname">predict</td><td>Pointer to the predicted values </td></tr>
    <tr><td class="paramname">real</td><td>Pointer to the real values </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00629">629</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_ae77920db6adf79a17dbfb1dbf1ab5656_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_ae77920db6adf79a17dbfb1dbf1ab5656_cgraph" alt="Call graph for nz::krnl::MSEBackward"/></div>
<map name="anamespacenz_1_1krnl_ae77920db6adf79a17dbfb1dbf1ab5656_cgraph" id="anamespacenz_1_1krnl_ae77920db6adf79a17dbfb1dbf1ab5656_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the Mean Squared Error (MSE) loss for backpropagation." alt="" coords="5,47,165,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="213,5,398,48"/>
<area shape="poly" title=" " alt="" coords="165,45,197,40,198,46,165,51"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="213,72,398,115"/>
<area shape="poly" title=" " alt="" coords="165,69,198,74,197,80,165,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="446,39,631,81"/>
<area shape="poly" title=" " alt="" coords="398,77,430,73,431,78,399,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="446,105,631,148"/>
<area shape="poly" title=" " alt="" coords="399,104,431,109,430,114,398,109"/>
</map>
</div>

</div>
</div>
<a id="ada94b8c5c6e6d72132face63a3305624" name="ada94b8c5c6e6d72132face63a3305624"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ada94b8c5c6e6d72132face63a3305624">&#9670;&#160;</a></span>NAdam()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::NAdam </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>m</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>m_modified</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>v</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>lr</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta1</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta2</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>eps</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">int</td>          <td class="paramname"><span class="paramname"><em>t</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply NAdam optimization. </p>
<p>This function updates the data array using NAdam optimization, which combines Adam with Nesterov momentum.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">data</td><td>Pointer to the data array that will be updated </td></tr>
    <tr><td class="paramname">m</td><td>Pointer to the first moment estimate (mean of gradients) </td></tr>
    <tr><td class="paramname">m_modified</td><td>Pointer to the modified first moment estimate for Nesterov momentum </td></tr>
    <tr><td class="paramname">v</td><td>Pointer to the second moment estimate (uncentered variance of gradients) </td></tr>
    <tr><td class="paramname">grad</td><td>Pointer to the gradient array </td></tr>
    <tr><td class="paramname">lr</td><td>The learning rate used for the gradient update </td></tr>
    <tr><td class="paramname">beta1</td><td>The exponential decay rate for the first moment estimate (typically 0.9) </td></tr>
    <tr><td class="paramname">beta2</td><td>The exponential decay rate for the second moment estimate (typically 0.999) </td></tr>
    <tr><td class="paramname">eps</td><td>A small constant to avoid division by zero (typically 1e-8) </td></tr>
    <tr><td class="paramname">t</td><td>The current time step or iteration </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the data, gradient, and moment arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00793">793</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_ada94b8c5c6e6d72132face63a3305624_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_ada94b8c5c6e6d72132face63a3305624_cgraph" alt="Call graph for nz::krnl::NAdam"/></div>
<map name="anamespacenz_1_1krnl_ada94b8c5c6e6d72132face63a3305624_cgraph" id="anamespacenz_1_1krnl_ada94b8c5c6e6d72132face63a3305624_cgraph">
<area shape="rect" title="Kernel function to apply NAdam optimization." alt="" coords="5,13,122,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="170,5,355,48"/>
<area shape="poly" title=" " alt="" coords="122,24,154,24,154,29,122,29"/>
</map>
</div>

</div>
</div>
<a id="af7069a420e81babb49b1bc009333d053" name="af7069a420e81babb49b1bc009333d053"></a>
<h2 class="memtitle"><span class="permalink"><a href="#af7069a420e81babb49b1bc009333d053">&#9670;&#160;</a></span>Negation()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Negation </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to negate each element of a matrix on the GPU. </p>
<p>This function negates each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the negated result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00209">209</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_af7069a420e81babb49b1bc009333d053_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_af7069a420e81babb49b1bc009333d053_cgraph" alt="Call graph for nz::krnl::Negation"/></div>
<map name="anamespacenz_1_1krnl_af7069a420e81babb49b1bc009333d053_cgraph" id="anamespacenz_1_1krnl_af7069a420e81babb49b1bc009333d053_cgraph">
<area shape="rect" title="Kernel function to negate each element of a matrix on the GPU." alt="" coords="5,47,130,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: get singleton instance" coords="178,5,363,48"/>
<area shape="poly" title=" " alt="" coords="130,47,162,42,163,47,130,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="StreamManager: submit CUDA kernel asynchronously" coords="178,72,363,115"/>
<area shape="poly" title=" " alt="" coords="130,68,163,73,162,78,130,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="StreamManager: acquire CUDA stream from pool" coords="411,39,596,81"/>
<area shape="poly" title=" " alt="" coords="363,77,395,73,396,78,364,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="StreamManager: synchronize stream until data writes complete" coords="411,105,596,148"/>
<area shape="poly" title=" " alt="" coords="364,104,396,109,395,114,363,109"/>
</map>
</div>

</div>
</div>
<a id="a9ac0590fbb5eb7f51b05da574e9845a8" name="a9ac0590fbb5eb7f51b05da574e9845a8"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a9ac0590fbb5eb7f51b05da574e9845a8">&#9670;&#160;</a></span>NgradCopy()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::NgradCopy </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_o</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset_i</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Copies gradient data from one array to another with specified offsets. </p>
<p>This kernel function performs a gradient copy operation, transferring data from the input array to the output array while applying offsets for both the input and output arrays.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration. </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the gradient data will be stored. </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array containing the gradient data to be copied. </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements to copy. </td></tr>
    <tr><td class="paramname">offset_o</td><td>A vector of offsets for the output array. </td></tr>
    <tr><td class="paramname">offset_i</td><td>A vector of offsets for the input array.</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function is designed for use in GPU-based gradient operations and assumes that the input and output arrays are properly allocated and accessible on the device. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01264">1264</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a9ac0590fbb5eb7f51b05da574e9845a8_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a9ac0590fbb5eb7f51b05da574e9845a8_cgraph" alt="Call graph for nz::krnl::NgradCopy"/></div>
<map name="anamespacenz_1_1krnl_a9ac0590fbb5eb7f51b05da574e9845a8_cgraph" id="anamespacenz_1_1krnl_a9ac0590fbb5eb7f51b05da574e9845a8_cgraph">
<area shape="rect" title="Copies gradient data from one array to another with specified offsets." alt="" coords="5,113,144,140"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="StreamManager: acquire CUDA stream from pool" coords="192,5,377,48"/>
<area shape="poly" title=" " alt="" coords="96,110,139,85,191,58,203,52,206,57,193,62,142,89,99,115"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: get singleton instance" coords="192,72,377,115"/>
<area shape="poly" title=" " alt="" coords="144,113,176,108,177,113,145,118"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1260d95d0eddf75b72700da07361a4bd" title="Records write completion event for asynchronous data operations." alt="StreamManager: record write completion event" coords="192,139,377,181"/>
<area shape="poly" title=" " alt="" coords="145,135,177,140,176,145,144,140"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="StreamManager: synchronize stream until data writes complete" coords="192,205,377,248"/>
<area shape="poly" title=" " alt="" coords="99,138,142,164,193,191,206,197,203,201,191,196,139,169,96,143"/>
</map>
</div>

</div>
</div>
<a id="adc047e65307dbc711235f637227b7d10" name="adc047e65307dbc711235f637227b7d10"></a>
<h2 class="memtitle"><span class="permalink"><a href="#adc047e65307dbc711235f637227b7d10">&#9670;&#160;</a></span>Recip()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Recip </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the reciprocal of each element of a matrix on the GPU. </p>
<p>This function computes the reciprocal (1/x) of each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the reciprocal result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00226">226</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_adc047e65307dbc711235f637227b7d10_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_adc047e65307dbc711235f637227b7d10_cgraph" alt="Call graph for nz::krnl::Recip"/></div>
<map name="anamespacenz_1_1krnl_adc047e65307dbc711235f637227b7d10_cgraph" id="anamespacenz_1_1krnl_adc047e65307dbc711235f637227b7d10_cgraph">
<area shape="rect" title="Kernel function to compute the reciprocal of each element of a matrix on the GPU." alt="" coords="5,47,112,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: get singleton instance" coords="160,5,345,48"/>
<area shape="poly" title=" " alt="" coords="112,48,144,43,145,48,113,54"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="StreamManager: submit CUDA kernel asynchronously" coords="160,72,345,115"/>
<area shape="poly" title=" " alt="" coords="113,66,145,72,144,77,112,72"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="StreamManager: acquire CUDA stream from pool" coords="393,39,578,81"/>
<area shape="poly" title=" " alt="" coords="345,77,377,73,378,78,346,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="StreamManager: synchronize stream until data writes complete" coords="393,105,578,148"/>
<area shape="poly" title=" " alt="" coords="346,104,378,109,377,114,345,109"/>
</map>
</div>

</div>
</div>
<a id="a8855f411733f7de29d013f4ad40096c9" name="a8855f411733f7de29d013f4ad40096c9"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a8855f411733f7de29d013f4ad40096c9">&#9670;&#160;</a></span>RectifiedLinearUnit()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::RectifiedLinearUnit </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Rectified Linear Unit (ReLU) activation on the GPU. </p>
<p>This function applies the ReLU activation function (max(0, x)) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the ReLU result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00237">237</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a8855f411733f7de29d013f4ad40096c9_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a8855f411733f7de29d013f4ad40096c9_cgraph" alt="Call graph for nz::krnl::RectifiedLinearUnit"/></div>
<map name="anamespacenz_1_1krnl_a8855f411733f7de29d013f4ad40096c9_cgraph" id="anamespacenz_1_1krnl_a8855f411733f7de29d013f4ad40096c9_cgraph">
<area shape="rect" title="Kernel function to apply the Rectified Linear Unit (ReLU) activation on the GPU." alt="" coords="5,47,189,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: get singleton instance" coords="237,5,422,48"/>
<area shape="poly" title=" " alt="" coords="189,44,221,40,221,45,189,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="StreamManager: submit CUDA kernel asynchronously" coords="237,72,422,115"/>
<area shape="poly" title=" " alt="" coords="189,71,221,75,221,80,189,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="StreamManager: acquire CUDA stream from pool" coords="470,39,655,81"/>
<area shape="poly" title=" " alt="" coords="422,77,454,73,455,78,423,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="StreamManager: synchronize stream until data writes complete" coords="470,105,655,148"/>
<area shape="poly" title=" " alt="" coords="423,104,455,109,454,114,422,109"/>
</map>
</div>

</div>
</div>
<a id="a4ddfc808de99fe831e74a3bd3f9bbdaf" name="a4ddfc808de99fe831e74a3bd3f9bbdaf"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a4ddfc808de99fe831e74a3bd3f9bbdaf">&#9670;&#160;</a></span>ReLUBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::ReLUBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the ReLU activation during backpropagation. </p>
<p>This function computes the gradient of the ReLU activation function during backpropagation (dL/dx = dL/dy * (x &gt; 0)) and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the input array elements (before activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the incoming gradient from the next layer (dL/dy) </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00250">250</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a4ddfc808de99fe831e74a3bd3f9bbdaf_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a4ddfc808de99fe831e74a3bd3f9bbdaf_cgraph" alt="Call graph for nz::krnl::ReLUBackward"/></div>
<map name="anamespacenz_1_1krnl_a4ddfc808de99fe831e74a3bd3f9bbdaf_cgraph" id="anamespacenz_1_1krnl_a4ddfc808de99fe831e74a3bd3f9bbdaf_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the ReLU activation during backpropagation." alt="" coords="5,47,170,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: get singleton instance" coords="218,5,403,48"/>
<area shape="poly" title=" " alt="" coords="170,45,202,40,203,45,171,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="StreamManager: submit CUDA kernel asynchronously" coords="218,72,403,115"/>
<area shape="poly" title=" " alt="" coords="171,70,203,75,202,80,170,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="StreamManager: acquire CUDA stream from pool" coords="451,39,636,81"/>
<area shape="poly" title=" " alt="" coords="403,77,435,73,436,78,404,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="StreamManager: synchronize stream until data writes complete" coords="451,105,636,148"/>
<area shape="poly" title=" " alt="" coords="404,104,436,109,435,114,403,109"/>
</map>
</div>

</div>
</div>
<a id="aaf3c9cca114d003130ffa4354b4a24de" name="aaf3c9cca114d003130ffa4354b4a24de"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aaf3c9cca114d003130ffa4354b4a24de">&#9670;&#160;</a></span>RMSprop()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::RMSprop </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>v</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>lr</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>beta</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>eps</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply RMSprop optimization. </p>
<p>This function updates the data array using RMSprop optimization, which divides the gradient by the square root of the moving average of the squared gradient values.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">data</td><td>Pointer to the data array that will be updated </td></tr>
    <tr><td class="paramname">v</td><td>Pointer to the array of accumulated squared gradients </td></tr>
    <tr><td class="paramname">grad</td><td>Pointer to the gradient array </td></tr>
    <tr><td class="paramname">lr</td><td>The learning rate used for the gradient update </td></tr>
    <tr><td class="paramname">beta</td><td>The smoothing factor (typically between 0.9 and 0.99) </td></tr>
    <tr><td class="paramname">eps</td><td>A small constant to avoid division by zero (default 1e-8) </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the data, gradient, and accumulated squared gradient arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00747">747</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aaf3c9cca114d003130ffa4354b4a24de_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aaf3c9cca114d003130ffa4354b4a24de_cgraph" alt="Call graph for nz::krnl::RMSprop"/></div>
<map name="anamespacenz_1_1krnl_aaf3c9cca114d003130ffa4354b4a24de_cgraph" id="anamespacenz_1_1krnl_aaf3c9cca114d003130ffa4354b4a24de_cgraph">
<area shape="rect" title="Kernel function to apply RMSprop optimization." alt="" coords="5,13,133,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: get singleton instance" coords="181,5,366,48"/>
<area shape="poly" title=" " alt="" coords="133,24,165,24,165,29,133,29"/>
</map>
</div>

</div>
</div>
<a id="a56f84e531825be8b2b0974c2488eb765" name="a56f84e531825be8b2b0974c2488eb765"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a56f84e531825be8b2b0974c2488eb765">&#9670;&#160;</a></span>ScalarAdd()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::ScalarAdd </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>num</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to add a scalar to each element of a matrix on the GPU. </p>
<p>This function adds a scalar value to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">num</td><td>The scalar value to add to each element of the input array </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00196">196</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a56f84e531825be8b2b0974c2488eb765_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a56f84e531825be8b2b0974c2488eb765_cgraph" alt="Call graph for nz::krnl::ScalarAdd"/></div>
<map name="anamespacenz_1_1krnl_a56f84e531825be8b2b0974c2488eb765_cgraph" id="anamespacenz_1_1krnl_a56f84e531825be8b2b0974c2488eb765_cgraph">
<area shape="rect" title="Kernel function to add a scalar to each element of a matrix on the GPU." alt="" coords="5,47,138,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: get singleton instance" coords="186,5,371,48"/>
<area shape="poly" title=" " alt="" coords="138,47,170,41,171,47,138,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="StreamManager: submit CUDA kernel asynchronously" coords="186,72,371,115"/>
<area shape="poly" title=" " alt="" coords="138,68,171,73,170,79,138,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="StreamManager: acquire CUDA stream from pool" coords="419,39,604,81"/>
<area shape="poly" title=" " alt="" coords="371,77,403,73,404,78,372,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="StreamManager: synchronize stream until data writes complete" coords="419,105,604,148"/>
<area shape="poly" title=" " alt="" coords="372,104,404,109,403,114,371,109"/>
</map>
</div>

</div>
</div>
<a id="a27bc4025be4253d5fffae2bf1b43b3af" name="a27bc4025be4253d5fffae2bf1b43b3af"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a27bc4025be4253d5fffae2bf1b43b3af">&#9670;&#160;</a></span>ScalarDiv()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::ScalarDiv </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>num</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform scalar division on the GPU. </p>
<p>This function divides each element of the input array by a scalar value and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">num</td><td>The scalar value to divide each element of the input array by </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00183">183</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a27bc4025be4253d5fffae2bf1b43b3af_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a27bc4025be4253d5fffae2bf1b43b3af_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a27bc4025be4253d5fffae2bf1b43b3af_cgraph" id="anamespacenz_1_1krnl_a27bc4025be4253d5fffae2bf1b43b3af_cgraph">
<area shape="rect" title="Kernel function to perform scalar division on the GPU." alt="" coords="5,47,135,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="183,5,368,48"/>
<area shape="poly" title=" " alt="" coords="134,47,167,42,168,47,135,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="183,72,368,115"/>
<area shape="poly" title=" " alt="" coords="135,68,168,73,167,78,134,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="416,39,601,81"/>
<area shape="poly" title=" " alt="" coords="368,77,400,73,401,78,369,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="416,105,601,148"/>
<area shape="poly" title=" " alt="" coords="369,104,401,109,400,114,368,109"/>
</map>
</div>

</div>
</div>
<a id="a5af716524e248c61f3dce227d8ef6e34" name="a5af716524e248c61f3dce227d8ef6e34"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a5af716524e248c61f3dce227d8ef6e34">&#9670;&#160;</a></span>ScalarMul()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::ScalarMul </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>num</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform scalar multiplication on the GPU. </p>
<p>This function multiplies each element of the input array by a scalar value and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">num</td><td>The scalar value to multiply each element of the input array by </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00170">170</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a5af716524e248c61f3dce227d8ef6e34_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a5af716524e248c61f3dce227d8ef6e34_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a5af716524e248c61f3dce227d8ef6e34_cgraph" id="anamespacenz_1_1krnl_a5af716524e248c61f3dce227d8ef6e34_cgraph">
<area shape="rect" title="Kernel function to perform scalar multiplication on the GPU." alt="" coords="5,47,136,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="184,5,369,48"/>
<area shape="poly" title=" " alt="" coords="136,47,168,42,169,47,137,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="184,72,369,115"/>
<area shape="poly" title=" " alt="" coords="137,68,169,73,168,78,136,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="417,39,602,81"/>
<area shape="poly" title=" " alt="" coords="369,77,401,73,402,78,370,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="417,105,602,148"/>
<area shape="poly" title=" " alt="" coords="370,104,402,109,401,114,369,109"/>
</map>
</div>

</div>
</div>
<a id="a21bbbcf6d97bfaccc828ce7736814bd4" name="a21bbbcf6d97bfaccc828ce7736814bd4"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a21bbbcf6d97bfaccc828ce7736814bd4">&#9670;&#160;</a></span>Sigmoid()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Sigmoid </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Sigmoid activation function on the GPU. </p>
<p>This function applies the Sigmoid activation function (1 / (1 + exp(-x))) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Sigmoid result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00263">263</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a21bbbcf6d97bfaccc828ce7736814bd4_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a21bbbcf6d97bfaccc828ce7736814bd4_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a21bbbcf6d97bfaccc828ce7736814bd4_cgraph" id="anamespacenz_1_1krnl_a21bbbcf6d97bfaccc828ce7736814bd4_cgraph">
<area shape="rect" title="Kernel function to apply the Sigmoid activation function on the GPU." alt="" coords="5,47,125,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="173,5,358,48"/>
<area shape="poly" title=" " alt="" coords="125,47,157,42,158,47,126,53"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="173,72,358,115"/>
<area shape="poly" title=" " alt="" coords="126,67,158,73,157,78,125,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="406,39,591,81"/>
<area shape="poly" title=" " alt="" coords="358,77,390,73,391,78,359,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="406,105,591,148"/>
<area shape="poly" title=" " alt="" coords="359,104,391,109,390,114,358,109"/>
</map>
</div>

</div>
</div>
<a id="aff1f9f1bf9fb677024bd2b565fab9801" name="aff1f9f1bf9fb677024bd2b565fab9801"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aff1f9f1bf9fb677024bd2b565fab9801">&#9670;&#160;</a></span>SigmoidBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::SigmoidBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the Sigmoid activation during backpropagation. </p>
<p>This function computes the gradient of the Sigmoid activation function during backpropagation (dL/dx = dL/dy * sigmoid(x) * (1 - sigmoid(x))) and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">B</td><td>Pointer to the input array elements (after activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the gradient of the next layer </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00277">277</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aff1f9f1bf9fb677024bd2b565fab9801_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aff1f9f1bf9fb677024bd2b565fab9801_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_aff1f9f1bf9fb677024bd2b565fab9801_cgraph" id="anamespacenz_1_1krnl_aff1f9f1bf9fb677024bd2b565fab9801_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the Sigmoid activation during backpropagation." alt="" coords="5,47,183,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="231,5,416,48"/>
<area shape="poly" title=" " alt="" coords="183,44,215,40,215,45,183,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="231,72,416,115"/>
<area shape="poly" title=" " alt="" coords="183,70,215,75,215,80,183,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="464,39,649,81"/>
<area shape="poly" title=" " alt="" coords="416,77,448,73,449,78,417,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="464,105,649,148"/>
<area shape="poly" title=" " alt="" coords="417,104,449,109,448,114,416,109"/>
</map>
</div>

</div>
</div>
<a id="adbafc409d57fa0a9d78ecac5bf7b10a3" name="adbafc409d57fa0a9d78ecac5bf7b10a3"></a>
<h2 class="memtitle"><span class="permalink"><a href="#adbafc409d57fa0a9d78ecac5bf7b10a3">&#9670;&#160;</a></span>Softmax()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Softmax </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>exp_sum_of_input</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Softmax function on the GPU. </p>
<p>This function applies the Softmax activation function, which normalizes the input values by exponentiating them and dividing by the sum of all exponentials, to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Softmax result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">exp_sum_of_input</td><td>The sum of the exponentials of the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
    <tr><td class="paramname">offset</td><td>Optional offset; defaults to 0 </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00525">525</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_adbafc409d57fa0a9d78ecac5bf7b10a3_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_adbafc409d57fa0a9d78ecac5bf7b10a3_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_adbafc409d57fa0a9d78ecac5bf7b10a3_cgraph" id="anamespacenz_1_1krnl_adbafc409d57fa0a9d78ecac5bf7b10a3_cgraph">
<area shape="rect" title="Kernel function to apply the Softmax function on the GPU." alt="" coords="5,47,127,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="175,5,360,48"/>
<area shape="poly" title=" " alt="" coords="126,47,159,42,159,47,127,53"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="175,72,360,115"/>
<area shape="poly" title=" " alt="" coords="127,67,159,73,159,78,126,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="408,39,593,81"/>
<area shape="poly" title=" " alt="" coords="360,77,392,73,393,78,361,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="408,105,593,148"/>
<area shape="poly" title=" " alt="" coords="361,104,393,109,392,114,360,109"/>
</map>
</div>

</div>
</div>
<a id="a4375738c83ef892783abc210578e5b39" name="a4375738c83ef892783abc210578e5b39"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a4375738c83ef892783abc210578e5b39">&#9670;&#160;</a></span>SoftmaxJacobian()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::SoftmaxJacobian </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the Jacobian of the Softmax function. </p>
<p>This function computes the Jacobian matrix of the Softmax function and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Jacobian matrix will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input array </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00567">567</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a4375738c83ef892783abc210578e5b39_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a4375738c83ef892783abc210578e5b39_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a4375738c83ef892783abc210578e5b39_cgraph" id="anamespacenz_1_1krnl_a4375738c83ef892783abc210578e5b39_cgraph">
<area shape="rect" title="Kernel function to compute the Jacobian of the Softmax function." alt="" coords="5,47,179,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="227,5,412,48"/>
<area shape="poly" title=" " alt="" coords="179,45,211,40,212,45,179,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="227,72,412,115"/>
<area shape="poly" title=" " alt="" coords="179,70,212,75,211,80,179,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="460,39,645,81"/>
<area shape="poly" title=" " alt="" coords="412,77,444,73,445,78,413,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="460,105,645,148"/>
<area shape="poly" title=" " alt="" coords="413,104,445,109,444,114,412,109"/>
</map>
</div>

</div>
</div>
<a id="aeec286d5351eee7061e151470adb4eef" name="aeec286d5351eee7061e151470adb4eef"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aeec286d5351eee7061e151470adb4eef">&#9670;&#160;</a></span>StochasticGradientDescent()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::StochasticGradientDescent </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float</td>          <td class="paramname"><span class="paramname"><em>lr</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform Stochastic Gradient Descent (SGD) optimization. </p>
<p>This function updates the data array by applying Stochastic Gradient Descent with the given learning rate and gradient for each element in the input arrays.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
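    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">data</td><td>Pointer to the data array that will be updated </td></tr>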
    <tr><td class="paramname">grad</td><td>Pointer to the gradient array </td></tr>
    <tr><td class="paramname">lr</td><td>The learning rate used for the gradient update </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the data and gradient arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00642">642</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aeec286d5351eee7061e151470adb4eef_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aeec286d5351eee7061e151470adb4eef_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_aeec286d5351eee7061e151470adb4eef_cgraph" id="anamespacenz_1_1krnl_aeec286d5351eee7061e151470adb4eef_cgraph">
<area shape="rect" title="Kernel function to perform Stochastic Gradient Descent (SGD) optimization." alt="" coords="5,39,189,81"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="237,5,422,48"/>
<area shape="poly" title=" " alt="" coords="189,44,221,40,221,45,189,49"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="237,72,422,115"/>
<area shape="poly" title=" " alt="" coords="189,71,221,75,221,80,189,76"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="470,39,655,81"/>
<area shape="poly" title=" " alt="" coords="422,77,454,73,455,78,423,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="470,105,655,148"/>
<area shape="poly" title=" " alt="" coords="423,104,455,109,454,114,422,109"/>
</map>
</div>

</div>
</div>
<a id="a1ae846a65c2f5b83cd1b9fc61b877854" name="a1ae846a65c2f5b83cd1b9fc61b877854"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a1ae846a65c2f5b83cd1b9fc61b877854">&#9670;&#160;</a></span>Summation()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Summation </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>sharedMemSize</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform element-wise summation of two arrays. </p>
<p>This function performs element-wise summation of two input arrays and stores the result in an output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">sharedMemSize</td><td>The size of the shared memory buffer </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
    <tr><td class="paramname">offset</td><td>Optional offset; defaults to 0 </td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>This function is used for computing the summation of the elements of the input array. </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l01225">1225</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a1ae846a65c2f5b83cd1b9fc61b877854_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a1ae846a65c2f5b83cd1b9fc61b877854_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a1ae846a65c2f5b83cd1b9fc61b877854_cgraph" id="anamespacenz_1_1krnl_a1ae846a65c2f5b83cd1b9fc61b877854_cgraph">
<area shape="rect" title="Kernel function to perform summation of the elements of an array." alt="" coords="5,47,144,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="192,5,377,48"/>
<area shape="poly" title=" " alt="" coords="144,46,176,41,177,46,145,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="192,72,377,115"/>
<area shape="poly" title=" " alt="" coords="145,68,177,74,176,79,144,74"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="425,39,610,81"/>
<area shape="poly" title=" " alt="" coords="377,77,409,73,410,78,378,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="425,105,610,148"/>
<area shape="poly" title=" " alt="" coords="378,104,410,109,409,114,377,109"/>
</map>
</div>

</div>
</div>
<a id="a51a5ff3c8cc2c3051fddf32de294b467" name="a51a5ff3c8cc2c3051fddf32de294b467"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a51a5ff3c8cc2c3051fddf32de294b467">&#9670;&#160;</a></span>SummationExp()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::SummationExp </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>sharedMemSize</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>g_data</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the summation of exponentials of each element in the input array. </p>
<p>This function computes the summation of exponentials of all elements in the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">sharedMemSize</td><td>The size of the shared memory buffer used by the kernel </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the summation of exponentials will be stored </td></tr>
    <tr><td class="paramname">g_data</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input array </td></tr>
    <tr><td class="paramname">offset</td><td>Optional offset; defaults to 0 </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00510">510</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a51a5ff3c8cc2c3051fddf32de294b467_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a51a5ff3c8cc2c3051fddf32de294b467_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a51a5ff3c8cc2c3051fddf32de294b467_cgraph" id="anamespacenz_1_1krnl_a51a5ff3c8cc2c3051fddf32de294b467_cgraph">
<area shape="rect" title="Kernel function to compute the summation of exponentials of each element in the input array." alt="" coords="5,47,167,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="215,5,400,48"/>
<area shape="poly" title=" " alt="" coords="166,45,199,40,200,46,167,51"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="215,72,400,115"/>
<area shape="poly" title=" " alt="" coords="167,69,200,74,199,80,166,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="448,39,633,81"/>
<area shape="poly" title=" " alt="" coords="400,77,432,73,433,78,401,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="448,105,633,148"/>
<area shape="poly" title=" " alt="" coords="401,104,433,109,432,114,400,109"/>
</map>
</div>

</div>
</div>
<a id="a997aa5460fd64fadf9b701fbf73e3fb2" name="a997aa5460fd64fadf9b701fbf73e3fb2"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a997aa5460fd64fadf9b701fbf73e3fb2">&#9670;&#160;</a></span>Swish()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Swish </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Swish activation function on the GPU. </p>
<p>This function applies the Swish activation function (x * sigmoid(x)) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Swish result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00344">344</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a997aa5460fd64fadf9b701fbf73e3fb2_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a997aa5460fd64fadf9b701fbf73e3fb2_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a997aa5460fd64fadf9b701fbf73e3fb2_cgraph" id="anamespacenz_1_1krnl_a997aa5460fd64fadf9b701fbf73e3fb2_cgraph">
<area shape="rect" title="Kernel function to apply the Swish activation function on the GPU." alt="" coords="5,47,114,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="162,5,347,48"/>
<area shape="poly" title=" " alt="" coords="113,48,146,43,147,48,114,53"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="162,72,347,115"/>
<area shape="poly" title=" " alt="" coords="114,67,147,72,146,77,113,72"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="395,39,580,81"/>
<area shape="poly" title=" " alt="" coords="347,77,379,73,380,78,348,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="395,105,580,148"/>
<area shape="poly" title=" " alt="" coords="348,104,380,109,379,114,347,109"/>
</map>
</div>

</div>
</div>
<a id="a6c5a4b54442aab42df5afe8688e71596" name="a6c5a4b54442aab42df5afe8688e71596"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a6c5a4b54442aab42df5afe8688e71596">&#9670;&#160;</a></span>SwishBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::SwishBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the Swish activation during backpropagation. </p>
<p>This function computes the gradient of the Swish activation function during backpropagation and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">A</td><td>Pointer to the input array elements (before activation) </td></tr>
    <tr><td class="paramname">B</td><td>Pointer to the output array elements (after activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the gradient of the next layer </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00359">359</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a6c5a4b54442aab42df5afe8688e71596_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a6c5a4b54442aab42df5afe8688e71596_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a6c5a4b54442aab42df5afe8688e71596_cgraph" id="anamespacenz_1_1krnl_a6c5a4b54442aab42df5afe8688e71596_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the Swish activation during backpropagation." alt="" coords="5,47,172,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="220,5,405,48"/>
<area shape="poly" title=" " alt="" coords="172,45,204,40,205,45,173,50"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="220,72,405,115"/>
<area shape="poly" title=" " alt="" coords="173,70,205,75,204,80,172,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="453,39,638,81"/>
<area shape="poly" title=" " alt="" coords="405,77,437,73,438,78,406,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="453,105,638,148"/>
<area shape="poly" title=" " alt="" coords="406,104,438,109,437,114,405,109"/>
</map>
</div>

</div>
</div>
<a id="aeb7d10939b25508e0b5db1fe44f4b467" name="aeb7d10939b25508e0b5db1fe44f4b467"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aeb7d10939b25508e0b5db1fe44f4b467">&#9670;&#160;</a></span>Tanh()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Tanh </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>out</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>in</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to apply the Tanh activation function on the GPU. </p>
<p>This function applies the Tanh activation function (tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))) to each element of the input array and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">out</td><td>Pointer to the output array where the Tanh result will be stored </td></tr>
    <tr><td class="paramname">in</td><td>Pointer to the input array elements </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00289">289</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aeb7d10939b25508e0b5db1fe44f4b467_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aeb7d10939b25508e0b5db1fe44f4b467_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_aeb7d10939b25508e0b5db1fe44f4b467_cgraph" id="anamespacenz_1_1krnl_aeb7d10939b25508e0b5db1fe44f4b467_cgraph">
<area shape="rect" title="Kernel function to apply the Tanh activation function on the GPU." alt="" coords="5,47,106,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="154,5,339,48"/>
<area shape="poly" title=" " alt="" coords="106,49,138,43,139,48,107,54"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="154,72,339,115"/>
<area shape="poly" title=" " alt="" coords="107,66,139,72,138,77,106,71"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="387,39,572,81"/>
<area shape="poly" title=" " alt="" coords="339,77,371,73,372,78,340,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="387,105,572,148"/>
<area shape="poly" title=" " alt="" coords="340,104,372,109,371,114,339,109"/>
</map>
</div>

</div>
</div>
<a id="a90d501e72361b7341f36394af0f27c74" name="a90d501e72361b7341f36394af0f27c74"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a90d501e72361b7341f36394af0f27c74">&#9670;&#160;</a></span>TanhBackward()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::TanhBackward </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B_grad</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>n</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to compute the gradient of the Tanh activation during backpropagation. </p>
<p>This function computes the gradient of the Tanh activation function during backpropagation (dL/dx = dL/dy * (1 - tanh(x)^2)) and stores the result in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">A_grad</td><td>Pointer to the output array where the gradient result will be stored </td></tr>
    <tr><td class="paramname">B</td><td>Pointer to the output array elements (after activation) </td></tr>
    <tr><td class="paramname">B_grad</td><td>Pointer to the gradient of the next layer </td></tr>
    <tr><td class="paramname">n</td><td>The number of elements in the arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00302">302</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a90d501e72361b7341f36394af0f27c74_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a90d501e72361b7341f36394af0f27c74_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_a90d501e72361b7341f36394af0f27c74_cgraph" id="anamespacenz_1_1krnl_a90d501e72361b7341f36394af0f27c74_cgraph">
<area shape="rect" title="Kernel function to compute the gradient of the Tanh activation during backpropagation." alt="" coords="5,47,164,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="212,5,397,48"/>
<area shape="poly" title=" " alt="" coords="164,45,196,40,196,46,165,51"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="212,72,397,115"/>
<area shape="poly" title=" " alt="" coords="165,69,196,74,196,80,164,75"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="445,39,630,81"/>
<area shape="poly" title=" " alt="" coords="397,77,429,73,430,78,398,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="445,105,630,148"/>
<area shape="poly" title=" " alt="" coords="398,104,430,109,429,114,397,109"/>
</map>
</div>

</div>
</div>
<a id="aa84aa2397f4f5a09a96bef76726e46f0" name="aa84aa2397f4f5a09a96bef76726e46f0"></a>
<h2 class="memtitle"><span class="permalink"><a href="#aa84aa2397f4f5a09a96bef76726e46f0">&#9670;&#160;</a></span>TensorCoreGEMM()</h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::TensorCoreGEMM </td>
          <td>(</td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>C</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>M</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>N</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned long long</td>          <td class="paramname"><span class="paramname"><em>K</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to perform fast matrix multiplication using Tensor Cores with half-precision (FP16) support. </p>
<p>This function performs matrix multiplication on two input matrices A and B using Tensor Cores, which are specialized hardware units in modern GPUs designed for high-throughput matrix operations. The matrices are internally padded to be multiples of 16 for efficient computation and then cropped back to their original dimensions after the operation.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">A</td><td>Pointer to the first input matrix (of size M x K) </td></tr>
    <tr><td class="paramname">B</td><td>Pointer to the second input matrix (of size K x N) </td></tr>
    <tr><td class="paramname">C</td><td>Pointer to the result matrix (of size M x N) </td></tr>
    <tr><td class="paramname">M</td><td>The number of rows in matrix A and matrix C </td></tr>
    <tr><td class="paramname">N</td><td>The number of columns in matrix B and matrix C </td></tr>
    <tr><td class="paramname">K</td><td>The number of columns in matrix A and rows in matrix B</td></tr>
  </table>
  </dd>
</dl>
<dl class="section note"><dt>Note</dt><dd>The matrices A and B are padded internally to the nearest multiple of 16 for efficient computation. After the computation, the resulting matrix C is cropped back to the original dimensions (M x N). </dd></dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00885">885</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_aa84aa2397f4f5a09a96bef76726e46f0_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_aa84aa2397f4f5a09a96bef76726e46f0_cgraph" alt=""/></div>
<map name="anamespacenz_1_1krnl_aa84aa2397f4f5a09a96bef76726e46f0_cgraph" id="anamespacenz_1_1krnl_aa84aa2397f4f5a09a96bef76726e46f0_cgraph">
<area shape="rect" title="Kernel function to perform fast matrix multiplication using Tensor Cores with half&#45;precision (FP16) s..." alt="" coords="5,147,186,173"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1084057ef6f5b2871c60702209bb4469" title="Asynchronously frees the CUDA device memory pointed to by the given pointer." alt="" coords="234,72,419,115"/>
<area shape="poly" title=" " alt="" coords="144,143,235,117,236,122,145,149"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="" coords="234,272,419,315"/>
<area shape="poly" title=" " alt="" coords="114,172,166,213,235,258,246,263,243,268,232,262,163,218,111,176"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a97f78a2d43f6e0508c82d4f3b629de96" title="Asynchronously allocates device memory for type&#45;specific data with stream&#45;ordered dependency tracking..." alt="" coords="234,5,419,48"/>
<area shape="poly" title=" " alt="" coords="111,144,163,102,232,58,243,52,246,57,235,62,166,107,114,148"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a71ad766cb2869d3dd6a3931966e81706" title="Asynchronously sets a block of CUDA device memory to a specified value." alt="" coords="234,139,419,181"/>
<area shape="poly" title=" " alt="" coords="186,157,218,157,218,163,186,163"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="" coords="234,205,419,248"/>
<area shape="poly" title=" " alt="" coords="145,171,236,198,235,203,144,177"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="" coords="467,72,652,115"/>
<area shape="poly" title=" " alt="" coords="419,91,451,91,451,96,419,96"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="" coords="467,139,652,181"/>
<area shape="poly" title=" " alt="" coords="404,113,468,131,467,136,402,118"/>
<area shape="poly" title=" " alt="" coords="404,46,468,65,467,70,402,51"/>
<area shape="poly" title=" " alt="" coords="402,136,467,117,468,122,404,141"/>
<area shape="poly" title=" " alt="" coords="419,157,451,157,451,163,419,163"/>
<area shape="poly" title=" " alt="" coords="399,202,417,191,432,175,441,159,449,142,465,125,471,121,474,125,469,129,454,145,445,161,437,178,421,195,402,207"/>
<area shape="poly" title=" " alt="" coords="402,202,467,184,468,189,404,207"/>
</map>
</div>

</div>
</div>
<a id="a16823e30ad99965b64a03e2d4a91a699" name="a16823e30ad99965b64a03e2d4a91a699"></a>
<h2 class="memtitle"><span class="permalink"><a href="#a16823e30ad99965b64a03e2d4a91a699">&#9670;&#160;</a></span>Transpose() <span class="overload">[1/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Transpose </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>d_A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>d_B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned int</td>          <td class="paramname"><span class="paramname"><em>rows</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned int</td>          <td class="paramname"><span class="paramname"><em>cols</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">const std::vector&lt; size_t &gt; &amp;</td>          <td class="paramname"><span class="paramname"><em>offset</em></span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to transpose a matrix on the GPU. </p>
<p>This function performs the transposition of a matrix on the GPU, swapping rows and columns. The resulting transposed matrix is stored in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">d_A</td><td>Pointer to the input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">d_B</td><td>Pointer to the output matrix where the transposed result will be stored </td></tr>
    <tr><td class="paramname">rows</td><td>The number of rows in the input matrix </td></tr>
    <tr><td class="paramname">cols</td><td>The number of columns in the input matrix </td></tr>
    <tr><td class="paramname">offset</td><td>The offsets within the input and output arrays, supplied as a vector of <code>size_t</code> values </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00154">154</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_a16823e30ad99965b64a03e2d4a91a699_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_a16823e30ad99965b64a03e2d4a91a699_cgraph" alt="Call graph for nz::krnl::Transpose() (overload 1 of 2)"/></div>
<map name="anamespacenz_1_1krnl_a16823e30ad99965b64a03e2d4a91a699_cgraph" id="anamespacenz_1_1krnl_a16823e30ad99965b64a03e2d4a91a699_cgraph">
<area shape="rect" title="Kernel function to transpose a matrix on the GPU." alt="" coords="5,13,139,40"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: returns a reference to the singleton instance" coords="187,5,372,48"/>
<area shape="poly" title=" " alt="" coords="139,24,171,24,171,29,139,29"/>
</map>
</div>

</div>
</div>
<a id="afe3f38f788c735b7eb718443eb0fd094" name="afe3f38f788c735b7eb718443eb0fd094"></a>
<h2 class="memtitle"><span class="permalink"><a href="#afe3f38f788c735b7eb718443eb0fd094">&#9670;&#160;</a></span>Transpose() <span class="overload">[2/2]</span></h2>

<div class="memitem">
<div class="memproto">
      <table class="memname">
        <tr>
          <td class="memname">void nz::krnl::Transpose </td>
          <td>(</td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>gridDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">dim3</td>          <td class="paramname"><span class="paramname"><em>blockDim</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>d_A</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">float *</td>          <td class="paramname"><span class="paramname"><em>d_B</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned int</td>          <td class="paramname"><span class="paramname"><em>rows</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">unsigned int</td>          <td class="paramname"><span class="paramname"><em>cols</em></span>, </td>
        </tr>
        <tr>
          <td class="paramkey"></td>
          <td></td>
          <td class="paramtype">size_t</td>          <td class="paramname"><span class="paramname"><em>offset</em></span><span class="paramdefsep"> = </span><span class="paramdefval">0</span>&#160;)</td>
        </tr>
      </table>
</div><div class="memdoc">

<p>Kernel function to transpose a matrix on the GPU. </p>
<p>This function performs the transposition of a matrix on the GPU, swapping rows and columns. The resulting transposed matrix is stored in the output array.</p>
<dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
    <tr><td class="paramname">gridDim</td><td>The grid dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">blockDim</td><td>The block dimensions for the CUDA kernel launch configuration </td></tr>
    <tr><td class="paramname">d_A</td><td>Pointer to the input matrix elements stored as a one-dimensional array </td></tr>
    <tr><td class="paramname">d_B</td><td>Pointer to the output matrix where the transposed result will be stored </td></tr>
    <tr><td class="paramname">rows</td><td>The number of rows in the input matrix </td></tr>
    <tr><td class="paramname">cols</td><td>The number of columns in the input matrix </td></tr>
    <tr><td class="paramname">offset</td><td>The offset within the input and output arrays </td></tr>
  </table>
  </dd>
</dl>

<p class="definition">Definition at line <a class="el" href="_operation_kernels_8cu_source.html#l00147">147</a> of file <a class="el" href="_operation_kernels_8cu_source.html">OperationKernels.cu</a>.</p>
<div class="dynheader">
Here is the call graph for this function:</div>
<div class="dyncontent">
<div class="center"><img src="namespacenz_1_1krnl_afe3f38f788c735b7eb718443eb0fd094_cgraph.png" border="0" usemap="#anamespacenz_1_1krnl_afe3f38f788c735b7eb718443eb0fd094_cgraph" alt="Call graph for nz::krnl::Transpose() (overload 2 of 2)"/></div>
<map name="anamespacenz_1_1krnl_afe3f38f788c735b7eb718443eb0fd094_cgraph" id="anamespacenz_1_1krnl_afe3f38f788c735b7eb718443eb0fd094_cgraph">
<area shape="rect" title="Kernel function to transpose a matrix on the GPU." alt="" coords="5,47,139,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#ab4b2eb422e0e1ee44bdfdc0eb94457ce" title="Returns a reference to the singleton instance of the StreamManager." alt="StreamManager: returns a reference to the singleton instance" coords="187,5,372,48"/>
<area shape="poly" title=" " alt="" coords="139,47,171,41,172,47,140,52"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a46ce59b45de432842454aadf00b93791" title="Asynchronously submits a CUDA kernel with stream&#45;ordered dependency management." alt="StreamManager: asynchronously submits a CUDA kernel" coords="187,72,372,115"/>
<area shape="poly" title=" " alt="" coords="140,68,172,73,171,79,139,73"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#a1de1cf3aadea137faf90a2f9b4b7abe2" title="Acquires CUDA stream from pool using round&#45;robin scheduling." alt="StreamManager: acquires a CUDA stream from the pool" coords="420,39,605,81"/>
<area shape="poly" title=" " alt="" coords="372,77,404,73,405,78,373,83"/>
<area shape="rect" href="classnz_1_1cu_strm_1_1_stream_manager.html#adb1078a67c6e38932d7d58c2adb05ec0" title="Synchronizes CUDA stream execution until data writes complete." alt="StreamManager: synchronizes CUDA stream execution until data writes complete" coords="420,105,605,148"/>
<area shape="poly" title=" " alt="" coords="373,104,405,109,404,114,372,109"/>
</map>
</div>

</div>
</div>
</div><!-- contents -->
<!-- start footer part -->
<hr class="footer"/><address class="footer"><small>
Generated by&#160;<a href="https://www.doxygen.org/index.html"><img class="footer" src="doxygen.svg" width="104" height="31" alt="doxygen"/></a> 1.12.0
</small></address>
</div><!-- doc-content -->
</body>
</html>
